In [1]:
import calendar
import time
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
import missingno as msno
from datetime import datetime
import matplotlib
import matplotlib.pyplot as plt
from scipy.stats import kendalltau
import warnings
#matplotlib.style.use('ggplot')
pd.options.mode.chained_assignment = None
pd.options.display.max_columns = 999
warnings.filterwarnings("ignore")
%matplotlib inline

color = sns.color_palette("hls", 8)
sns.set_style("whitegrid")
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.offline as offline
import plotly.graph_objs as go
# import cufflinks and offline mode
import cufflinks as cf
cf.go_offline()


print('plotly version:', __version__) 
init_notebook_mode(connected=True)
('plotly version:', '2.2.1')

Dataset

In [2]:
train_df = pd.read_csv("train_2016_v2.csv", parse_dates=["transactiondate"])
prop_df = pd.read_csv("properties_2016.csv")
In [3]:
print ("Shape Of Train: ", train_df.shape)
print ("Shape Of Properties: ", prop_df.shape)
('Shape Of Train: ', (90275, 3))
('Shape Of Properties: ', (2985217, 58))
In [4]:
train_df.head()
Out[4]:
parcelid logerror transactiondate
0 11016594 0.0276 2016-01-01
1 14366692 -0.1684 2016-01-01
2 12098116 -0.0040 2016-01-01
3 12643413 0.0218 2016-01-02
4 14432541 -0.0050 2016-01-02
In [5]:
prop_df.head()
Out[5]:
parcelid airconditioningtypeid architecturalstyletypeid basementsqft bathroomcnt bedroomcnt buildingclasstypeid buildingqualitytypeid calculatedbathnbr decktypeid finishedfloor1squarefeet calculatedfinishedsquarefeet finishedsquarefeet12 finishedsquarefeet13 finishedsquarefeet15 finishedsquarefeet50 finishedsquarefeet6 fips fireplacecnt fullbathcnt garagecarcnt garagetotalsqft hashottuborspa heatingorsystemtypeid latitude longitude lotsizesquarefeet poolcnt poolsizesum pooltypeid10 pooltypeid2 pooltypeid7 propertycountylandusecode propertylandusetypeid propertyzoningdesc rawcensustractandblock regionidcity regionidcounty regionidneighborhood regionidzip roomcnt storytypeid threequarterbathnbr typeconstructiontypeid unitcnt yardbuildingsqft17 yardbuildingsqft26 yearbuilt numberofstories fireplaceflag structuretaxvaluedollarcnt taxvaluedollarcnt assessmentyear landtaxvaluedollarcnt taxamount taxdelinquencyflag taxdelinquencyyear censustractandblock
0 10754147 NaN NaN NaN 0.0 0.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 6037.0 NaN NaN NaN NaN NaN NaN 34144442.0 -118654084.0 85768.0 NaN NaN NaN NaN NaN 010D 269.0 NaN 6.037800e+07 37688.0 3101.0 NaN 96337.0 0.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 9.0 2015.0 9.0 NaN NaN NaN NaN
1 10759547 NaN NaN NaN 0.0 0.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 6037.0 NaN NaN NaN NaN NaN NaN 34140430.0 -118625364.0 4083.0 NaN NaN NaN NaN NaN 0109 261.0 LCA11* 6.037800e+07 37688.0 3101.0 NaN 96337.0 0.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 27516.0 2015.0 27516.0 NaN NaN NaN NaN
2 10843547 NaN NaN NaN 0.0 0.0 NaN NaN NaN NaN NaN 73026.0 NaN NaN 73026.0 NaN NaN 6037.0 NaN NaN NaN NaN NaN NaN 33989359.0 -118394633.0 63085.0 NaN NaN NaN NaN NaN 1200 47.0 LAC2 6.037703e+07 51617.0 3101.0 NaN 96095.0 0.0 NaN NaN NaN 2.0 NaN NaN NaN NaN NaN 650756.0 1413387.0 2015.0 762631.0 20800.37 NaN NaN NaN
3 10859147 NaN NaN NaN 0.0 0.0 3.0 7.0 NaN NaN NaN 5068.0 NaN NaN 5068.0 NaN NaN 6037.0 NaN NaN NaN NaN NaN NaN 34148863.0 -118437206.0 7521.0 NaN NaN NaN NaN NaN 1200 47.0 LAC2 6.037141e+07 12447.0 3101.0 27080.0 96424.0 0.0 NaN NaN NaN NaN NaN NaN 1948.0 1.0 NaN 571346.0 1156834.0 2015.0 585488.0 14557.57 NaN NaN NaN
4 10879947 NaN NaN NaN 0.0 0.0 4.0 NaN NaN NaN NaN 1776.0 NaN NaN 1776.0 NaN NaN 6037.0 NaN NaN NaN NaN NaN NaN 34194168.0 -118385816.0 8512.0 NaN NaN NaN NaN NaN 1210 31.0 LAM1 6.037123e+07 12447.0 3101.0 46795.0 96450.0 0.0 NaN NaN NaN 1.0 NaN NaN 1947.0 NaN NaN 193796.0 433491.0 2015.0 239695.0 5725.17 NaN NaN NaN

List of real estate properties in 3 counties (Los Angeles, Orange and Ventura, California) data in 2016.

90,275 rows in train, 2,985,217 rows in properties file. Merge 2 files and then carry out analysis.

In [6]:
train_df = pd.merge(train_df,prop_df,on="parcelid",how="left")
In [7]:
train_df.head()
Out[7]:
parcelid logerror transactiondate airconditioningtypeid architecturalstyletypeid basementsqft bathroomcnt bedroomcnt buildingclasstypeid buildingqualitytypeid calculatedbathnbr decktypeid finishedfloor1squarefeet calculatedfinishedsquarefeet finishedsquarefeet12 finishedsquarefeet13 finishedsquarefeet15 finishedsquarefeet50 finishedsquarefeet6 fips fireplacecnt fullbathcnt garagecarcnt garagetotalsqft hashottuborspa heatingorsystemtypeid latitude longitude lotsizesquarefeet poolcnt poolsizesum pooltypeid10 pooltypeid2 pooltypeid7 propertycountylandusecode propertylandusetypeid propertyzoningdesc rawcensustractandblock regionidcity regionidcounty regionidneighborhood regionidzip roomcnt storytypeid threequarterbathnbr typeconstructiontypeid unitcnt yardbuildingsqft17 yardbuildingsqft26 yearbuilt numberofstories fireplaceflag structuretaxvaluedollarcnt taxvaluedollarcnt assessmentyear landtaxvaluedollarcnt taxamount taxdelinquencyflag taxdelinquencyyear censustractandblock
0 11016594 0.0276 2016-01-01 1.0 NaN NaN 2.0 3.0 NaN 4.0 2.0 NaN NaN 1684.0 1684.0 NaN NaN NaN NaN 6037.0 NaN 2.0 NaN NaN NaN 2.0 34280990.0 -118488536.0 7528.0 NaN NaN NaN NaN NaN 0100 261.0 LARS 6.037107e+07 12447.0 3101.0 31817.0 96370.0 0.0 NaN NaN NaN 1.0 NaN NaN 1959.0 NaN NaN 122754.0 360170.0 2015.0 237416.0 6735.88 NaN NaN 6.037107e+13
1 14366692 -0.1684 2016-01-01 NaN NaN NaN 3.5 4.0 NaN NaN 3.5 NaN NaN 2263.0 2263.0 NaN NaN NaN NaN 6059.0 NaN 3.0 2.0 468.0 NaN NaN 33668120.0 -117677556.0 3643.0 NaN NaN NaN NaN NaN 1 261.0 NaN 6.059052e+07 32380.0 1286.0 NaN 96962.0 0.0 NaN 1.0 NaN NaN NaN NaN 2014.0 NaN NaN 346458.0 585529.0 2015.0 239071.0 10153.02 NaN NaN NaN
2 12098116 -0.0040 2016-01-01 1.0 NaN NaN 3.0 2.0 NaN 4.0 3.0 NaN NaN 2217.0 2217.0 NaN NaN NaN NaN 6037.0 NaN 3.0 NaN NaN NaN 2.0 34136312.0 -118175032.0 11423.0 NaN NaN NaN NaN NaN 0100 261.0 PSR6 6.037464e+07 47019.0 3101.0 275411.0 96293.0 0.0 NaN NaN NaN 1.0 NaN NaN 1940.0 NaN NaN 61994.0 119906.0 2015.0 57912.0 11484.48 NaN NaN 6.037464e+13
3 12643413 0.0218 2016-01-02 1.0 NaN NaN 2.0 2.0 NaN 4.0 2.0 NaN NaN 839.0 839.0 NaN NaN NaN NaN 6037.0 NaN 2.0 NaN NaN NaN 2.0 33755800.0 -118309000.0 70859.0 NaN NaN NaN NaN NaN 010C 266.0 LAR3 6.037296e+07 12447.0 3101.0 54300.0 96222.0 0.0 NaN NaN NaN 1.0 NaN NaN 1987.0 NaN NaN 171518.0 244880.0 2015.0 73362.0 3048.74 NaN NaN 6.037296e+13
4 14432541 -0.0050 2016-01-02 NaN NaN NaN 2.5 4.0 NaN NaN 2.5 NaN NaN 2283.0 2283.0 NaN NaN NaN NaN 6059.0 NaN 2.0 2.0 598.0 NaN NaN 33485643.0 -117700234.0 6000.0 1.0 NaN NaN NaN 1.0 122 261.0 NaN 6.059042e+07 17686.0 1286.0 NaN 96961.0 8.0 NaN 1.0 NaN NaN NaN NaN 1981.0 2.0 NaN 169574.0 434551.0 2015.0 264977.0 5488.96 NaN NaN 6.059042e+13

Visualizing Datatypes

In [8]:
pd.options.display.max_rows = 65

dtype_df = train_df.dtypes.reset_index()
dtype_df.columns = ["Count", "Column Type"]
dtype_df
Out[8]:
Count Column Type
0 parcelid int64
1 logerror float64
2 transactiondate datetime64[ns]
3 airconditioningtypeid float64
4 architecturalstyletypeid float64
5 basementsqft float64
6 bathroomcnt float64
7 bedroomcnt float64
8 buildingclasstypeid float64
9 buildingqualitytypeid float64
10 calculatedbathnbr float64
11 decktypeid float64
12 finishedfloor1squarefeet float64
13 calculatedfinishedsquarefeet float64
14 finishedsquarefeet12 float64
15 finishedsquarefeet13 float64
16 finishedsquarefeet15 float64
17 finishedsquarefeet50 float64
18 finishedsquarefeet6 float64
19 fips float64
20 fireplacecnt float64
21 fullbathcnt float64
22 garagecarcnt float64
23 garagetotalsqft float64
24 hashottuborspa object
25 heatingorsystemtypeid float64
26 latitude float64
27 longitude float64
28 lotsizesquarefeet float64
29 poolcnt float64
30 poolsizesum float64
31 pooltypeid10 float64
32 pooltypeid2 float64
33 pooltypeid7 float64
34 propertycountylandusecode object
35 propertylandusetypeid float64
36 propertyzoningdesc object
37 rawcensustractandblock float64
38 regionidcity float64
39 regionidcounty float64
40 regionidneighborhood float64
41 regionidzip float64
42 roomcnt float64
43 storytypeid float64
44 threequarterbathnbr float64
45 typeconstructiontypeid float64
46 unitcnt float64
47 yardbuildingsqft17 float64
48 yardbuildingsqft26 float64
49 yearbuilt float64
50 numberofstories float64
51 fireplaceflag object
52 structuretaxvaluedollarcnt float64
53 taxvaluedollarcnt float64
54 assessmentyear float64
55 landtaxvaluedollarcnt float64
56 taxamount float64
57 taxdelinquencyflag object
58 taxdelinquencyyear float64
59 censustractandblock float64
In [9]:
dtype_df.groupby("Column Type").aggregate('count').reset_index()
Out[9]:
Column Type Count
0 int64 1
1 float64 53
2 datetime64[ns] 1
3 object 5
In [10]:
dataTypeDf = pd.DataFrame(train_df.dtypes.value_counts()).reset_index().rename(columns={"index":"variableType",0:"count"})
In [11]:
# Create a trace
trace = go.Bar(
    x = dataTypeDf["variableType"].astype(str),
    y = dataTypeDf["count"],
)

data = [trace]

# Edit the layout
layout = dict(title = "Variables Count Across Datatype",
              xaxis = dict(title = "VariableType"),
              yaxis = dict(title = "Count"),
              font = dict(size=15),
              autosize = False,
              width = 800,
              height = 500,
              )

fig = dict(data=data, layout=layout)

iplot(fig)
In [12]:
missing_df         = train_df.isnull().sum(axis=0).reset_index()
missing_df.columns = ['column_name', 'missing_count']

missing_df['missing_ratio'] = missing_df['missing_count'] / train_df.shape[0]
missing_df.loc[missing_df['missing_ratio']>0.995]
Out[12]:
column_name missing_count missing_ratio
4 architecturalstyletypeid 90014 0.997109
5 basementsqft 90232 0.999524
8 buildingclasstypeid 90259 0.999823
15 finishedsquarefeet13 90242 0.999634
18 finishedsquarefeet6 89854 0.995336
43 storytypeid 90232 0.999524
45 typeconstructiontypeid 89976 0.996688
48 yardbuildingsqft26 90180 0.998948
51 fireplaceflag 90053 0.997541

4 columns have missing values 99.9% of the times.!

Logerror:

Target variable for this competition is "logerror" field.

In [13]:
# Create a trace
trace = go.Scatter(
    x = range(train_df.shape[0]),
    y = np.sort(train_df.logerror.values),
    #mode= 'markers',
    #marker= dict(size= 4,                 line= dict(width=1),                 opacity= 0.3,                )
)

data = [trace]

# Edit the layout
layout = dict(title = 'Logerror distribution',
              xaxis = dict(title = 'index'),
              yaxis = dict(title = 'logerror'),
              font = dict(size=16),
              autosize = False,
              width = 600,
              height = 500,
              )

fig = dict(data=data, layout=layout)

iplot(fig)

Outliers at both the ends!

Remove the outliers and then do a histogram plot on the same.

In [14]:
ulimit = np.percentile(train_df.logerror.values, 99)
llimit = np.percentile(train_df.logerror.values, 1)
train_df['logerror'].ix[train_df['logerror']>ulimit] = ulimit
train_df['logerror'].ix[train_df['logerror']<llimit] = llimit
In [15]:
import plotly.figure_factory as ff
hist_data = [train_df.logerror.values]

group_labels = ['logerror']

# Create distplot with custom bin_size
fig = ff.create_distplot(hist_data, group_labels, bin_size=.01)

# Plot!
iplot(fig)
In [16]:
data = [go.Histogram(x=train_df.logerror.values)]
iplot(data)

Transaction Date:

Date field. Check number of transactions in each month.

In [17]:
train_df['transaction_month'] = train_df['transactiondate'].dt.month

cnt_srs = train_df['transaction_month'].value_counts()
In [18]:
data = [
    go.Bar(
        x=cnt_srs.index, 
        y=cnt_srs.values,
    )]

# Edit the layout
layout = dict(title = 'Transaction distribution',
              xaxis = dict(title = 'Month of transaction'),
              yaxis = dict(title = 'Number of Occurrences'),
              font  = dict(size=16),
              )

fig = dict(data=data, layout=layout)
iplot(fig)

Train data has all transactions before October 15, 2016, and some of the transactions after October 15, 2016.

So shorter bars in last 3 months.

Parcel Id:

In [19]:
(train_df['parcelid'].value_counts().reset_index())['parcelid'].value_counts()
Out[19]:
1    90026
2      123
3        1
Name: parcelid, dtype: int64

Most parcel ids are appearing only once in the dataset.

Missing Value Analysis

In [20]:
missingValueColumns = train_df.columns[train_df.isnull().any()]
msno.bar(train_df[missingValueColumns],\
            figsize=(20,8),color='blue',fontsize=12,labels=True)
In [21]:
missing_df         = prop_df.isnull().sum(axis=0).reset_index()
missing_df.columns = ['column_name', 'missing_count']
missing_df         = missing_df.loc[missing_df['missing_count']>0]
missing_df         = missing_df.sort_values(by='missing_count')
In [22]:
data = [go.Bar(
            x = missing_df.missing_count.values,
            y = missing_df.column_name,
            orientation = 'h',
            )]

# Edit the layout
layout = dict(title = "Number of missing values in each column",
              xaxis = dict(title = "Count of missing values"),
              yaxis = dict(tickangle=35, 
                           tickfont=dict(size=9)),
              font  = dict(size=8),
              autosize = False,
              width = 900,
              height = 990,
              )

fig = dict(data=data, layout=layout)
iplot(fig)

Univariate Analysis:

Since there are so many variables, investigate 'float' variables alone and then get the correlation with the target variable.

In [23]:
# Let us just impute the missing values with mean values to compute correlation coefficients #
mean_values  = train_df.mean(axis=0)
train_df_new = train_df.fillna(mean_values, inplace=True)

# Now let us look at the correlation coefficient of each of these variables #
x_cols = [col for col in train_df_new.columns if col not in ['logerror'] if train_df_new[col].dtype=='float64']

labels = []
values = []
for col in x_cols:
    labels.append(col)
    values.append(np.corrcoef(train_df_new[col].values, train_df_new.logerror.values)[0,1])
corr_df = pd.DataFrame({'col_labels':labels, 'corr_values':values})
corr_df = corr_df.sort_values(by='corr_values')
In [24]:
data = [go.Bar(
            x = np.array(corr_df.corr_values.values),
            y = corr_df['col_labels'],
            orientation = 'h',
            )]

# Edit the layout
layout = dict(title = "Correlation coefficient of the variables",
              xaxis = dict(title = "Correlation coefficient"),
              yaxis = dict(tickangle=35, 
                           tickfont=dict(size=9)),
              font  = dict(size=12),
              autosize = False,
              width = 900,
              height = 990,
              )

fig = dict(data=data, layout=layout)
iplot(fig)

The correlation of the target variable with the given set of variables is low overall.

A few variables at the top of this graph has no correlation values. There may be only one unique value and hence no correlation value.

In [25]:
corr_zero_cols = ['assessmentyear', 'storytypeid', 'pooltypeid2', 'pooltypeid7', 'pooltypeid10', 'poolcnt', 'decktypeid', 'buildingclasstypeid']
for col in corr_zero_cols:
    print(col, len(train_df_new[col].unique()))
('assessmentyear', 1)
('storytypeid', 1)
('pooltypeid2', 1)
('pooltypeid7', 1)
('pooltypeid10', 1)
('poolcnt', 1)
('decktypeid', 1)
('buildingclasstypeid', 1)

Check out variables with high correlation values.

In [26]:
corr_df_sel = corr_df.ix[(corr_df['corr_values']>0.02) | (corr_df['corr_values'] < -0.01)]
corr_df_sel
Out[26]:
col_labels corr_values
49 taxamount -0.014768
21 heatingorsystemtypeid -0.013732
43 yearbuilt 0.021171
4 bedroomcnt 0.032035
18 fullbathcnt 0.034267
7 calculatedbathnbr 0.036019
3 bathroomcnt 0.036862
10 calculatedfinishedsquarefeet 0.047659
11 finishedsquarefeet12 0.048611
In [27]:
cols_to_use = corr_df_sel.col_labels.tolist()

temp_df = train_df[cols_to_use]
corrmat = temp_df.corr(method='spearman')
In [28]:
trace = go.Heatmap(z=np.array(corrmat),
                   x = cols_to_use,
                   y = cols_to_use,
                   #colorscale= 'Jet')
                   colorscale=[[0.0000000000000000, 'rgb(165,0,38)'],    [0.1111111111111111, 'rgb(215,48,39)'],
                              [0.2222222222222222, 'rgb(244,109,67)'],  [0.3333333333333333, 'rgb(253,174,97)'], 
                              [0.4444444444444444, 'rgb(254,224,144)'], [0.5555555555555556, 'rgb(224,243,248)'], 
                              [0.6666666666666666, 'rgb(171,217,233)'], [0.7777777777777778, 'rgb(116,173,209)'], 
                              [0.8888888888888888, 'rgb(69,117,180)'],  [1.0000000000000000, 'rgb(49,54,149)']],)
                   #colorscale=[[1.0000000000000000, 'rgb(165,0,38)'],    [0.8888888888888888, 'rgb(215,48,39)'],
                   #            [0.7777777777777778, 'rgb(244,109,67)'],  [0.6666666666666666, 'rgb(253,174,97)'], 
                   #            [0.5555555555555556, 'rgb(254,224,144)'], [0.4444444444444444, 'rgb(224,243,248)'], 
                   #            [0.3333333333333333, 'rgb(171,217,233)'], [0.2222222222222222, 'rgb(116,173,209)'], 
                   #            [0.1111111111111111, 'rgb(69,117,180)'],  [0.0000000000000000, 'rgb(49,54,149)']],)
data=[trace]

# Edit the layout
layout = dict(title = "Important variables correlation map",
              font  = dict(size=12),
              autosize = False,
              width = 500,
              height = 500,
              )

fig = dict(data=data, layout=layout)
iplot(fig)

Let us now look at each of them. Investigate individually.

Bathroom Count:

In [29]:
data = [
    go.Histogram(
        x=train_df['bathroomcnt'],
        histnorm='count',
        #marker=dict(colorscale='Jet',),
        #opacity=0.75
    )]

# Edit the layout
layout = dict(title = 'Frequency of Bathroom count',
              xaxis = dict(title = 'Bathroom'),
              yaxis = dict(title = 'Count'),
              font  = dict(size=16),
              autosize = False,
              width = 800,
              height = 500,
              bargap=0.2,
              )

fig = dict(data=data, layout=layout)
iplot(fig)
In [30]:
src = list(train_df['bathroomcnt'].values) 
result_dict = dict( [ (i, src.count(i)) for i in set(src) ] )
f = train_df.sort_values(by=['bathroomcnt'], ascending=[True])
N = len(result_dict)    # Number of boxes
nbathroom = sorted(result_dict.keys())
# generate an array of rainbow colors by fixing the saturation and lightness of the HSL representation of colour 
# and marching around the hue. 
# Plotly accepts any CSS color format, see e.g. http://www.w3schools.com/cssref/css_colors_legal.asp.
c = ['hsl('+str(h)+',50%'+',50%)' for h in np.linspace(0, 360, N)]

# Each box is represented by a dict that contains the data, the type, and the colour. 
# Use list comprehension to describe N boxes, each with a different colour and with different randomly generated data:
data = [{
    'x': nbathroom[i],
    'y': f['logerror'][f['bathroomcnt']==nbathroom[i]],
    'name':nbathroom[i],
    'type':'box',
    'marker':{'color': c[i]}
    } for i in range(int(N))]

# format the layout
#layout = {'xaxis': {'showgrid':False,'zeroline':False, 'tickangle':60,'showticklabels':False},
#          'yaxis': {'zeroline':False,'gridcolor':'white'},
#          'paper_bgcolor': 'rgb(233,233,233)',
#          'plot_bgcolor': 'rgb(233,233,233)',
#          }

# Edit the layout
layout = dict(title = "Logerror vs Bathroom count",
              xaxis = dict(title = "Bathroom count"),
              yaxis = dict(title = "Logerror",
                           tickangle=0, 
                           tickfont=dict(size=9)),
              font  = dict(size=12),
              autosize = False,
              width = 900,
              height = 700,
              )

fig = dict(data=data, layout=layout)
iplot(fig)
In [31]:
data = [
    go.Histogram(
        x=train_df['bedroomcnt'],
        histnorm='count',
        #marker=dict(colorscale='Jet',),
        #opacity=0.75
    )]

# Edit the layout
layout = dict(title = 'Frequency of Bedroom count',
              xaxis = dict(title = 'Bedroom count'),
              yaxis = dict(title = 'Frequency'),
              font  = dict(size=16),
              autosize = False,
              width = 800,
              height = 500,
              bargap=0.2,
              )

fig = dict(data=data, layout=layout)
iplot(fig)

3.03 is the mean value with which we replaced the Null values.

In [32]:
train_df['bedroomcnt'].ix[train_df['bedroomcnt']>7] = 7
plt.figure(figsize=(12,8))
sns.violinplot(x='bedroomcnt', y='logerror', data=train_df)
plt.xlabel('Bedroom count', fontsize=12)
plt.ylabel('Log Error', fontsize=12)
plt.show()
In [33]:
train_df['bedroomcnt'].ix[train_df['bedroomcnt']>7] = 7
fig = ff.create_violin(train_df, data_header='logerror', group_header='bedroomcnt')

# Edit the layout
layout = dict(title='Log Error vs Bedroom count',
              xaxis = dict(title = 'Bedroom count'),
              yaxis = dict(title = 'Log Error'),
              font  = dict(size=16),
              autosize = False,
              width = 800,
              height = 500,
              bargap=0.2,
              )
iplot(fig, layout)
In [34]:
col = "taxamount"
ulimit = np.percentile(train_df[col].values, 99)
llimit = np.percentile(train_df[col].values, 1)
train_df[col].ix[train_df[col]>ulimit] = ulimit
train_df[col].ix[train_df[col]<llimit] = llimit

plt.figure(figsize=(12,12))
sns.jointplot(x=train_df['taxamount'].values, y=train_df['logerror'].values, size=10, color='g')
plt.ylabel('Log Error', fontsize=12)
plt.xlabel('Tax Amount', fontsize=12)
plt.title("Tax Amount Vs Log error", fontsize=15)
plt.show()
<matplotlib.figure.Figure at 0x44beebe0>
In [35]:
from ggplot import *
ggplot(aes(x='yearbuilt', y='logerror'), data=train_df) + \
    geom_point(color='steelblue', size=1) + \
    stat_smooth()

AttributeErrorTraceback (most recent call last)
C:\toolkits\Anaconda2-4.2.0\lib\site-packages\IPython\core\formatters.pyc in __call__(self, obj)
    670                 type_pprinters=self.type_printers,
    671                 deferred_pprinters=self.deferred_printers)
--> 672             printer.pretty(obj)
    673             printer.flush()
    674             return stream.getvalue()

C:\toolkits\Anaconda2-4.2.0\lib\site-packages\IPython\lib\pretty.pyc in pretty(self, obj)
    381                             if callable(meth):
    382                                 return meth(obj, self, cycle)
--> 383             return _default_pprint(obj, self, cycle)
    384         finally:
    385             self.end_group()

C:\toolkits\Anaconda2-4.2.0\lib\site-packages\IPython\lib\pretty.pyc in _default_pprint(obj, p, cycle)
    501     if _safe_getattr(klass, '__repr__', None) not in _baseclass_reprs:
    502         # A user-provided repr. Find newlines and replace them with p.break_()
--> 503         _repr_pprint(obj, p, cycle)
    504         return
    505     p.begin_group(1, '<')

C:\toolkits\Anaconda2-4.2.0\lib\site-packages\IPython\lib\pretty.pyc in _repr_pprint(obj, p, cycle)
    699     """A pprint that just redirects to the normal repr function."""
    700     # Find newlines and replace them with p.break_()
--> 701     output = repr(obj)
    702     for idx,output_line in enumerate(output.splitlines()):
    703         if idx:

C:\toolkits\Anaconda2-4.2.0\lib\site-packages\ggplot\ggplot.pyc in __repr__(self)
    114 
    115     def __repr__(self):
--> 116         self.make()
    117         # this is nice for dev but not the best for "real"
    118         if os.environ.get("GGPLOT_DEV"):

C:\toolkits\Anaconda2-4.2.0\lib\site-packages\ggplot\ggplot.pyc in make(self)
    634                         if kwargs==False:
    635                             continue
--> 636                         layer.plot(ax, facetgroup, self._aes, **kwargs)
    637 
    638             self.apply_limits()

C:\toolkits\Anaconda2-4.2.0\lib\site-packages\ggplot\stats\stat_smooth.pyc in plot(self, ax, data, _aes)
     75 
     76         smoothed_data = pd.DataFrame(dict(x=x, y=y, y1=y1, y2=y2))
---> 77         smoothed_data = smoothed_data.sort('x')
     78 
     79         params = self._get_plot_args(data, _aes)

C:\toolkits\Anaconda2-4.2.0\lib\site-packages\pandas\core\generic.pyc in __getattr__(self, name)
   3079             if name in self._info_axis:
   3080                 return self[name]
-> 3081             return object.__getattribute__(self, name)
   3082 
   3083     def __setattr__(self, name, value):

AttributeError: 'DataFrame' object has no attribute 'sort'
In [36]:
ggplot(aes(x='latitude', y='longitude', color='logerror'), data=train_df) + \
    geom_point() + \
    scale_color_gradient(low = 'red', high = 'blue')
Out[36]:
<ggplot: (15349112)>
In [37]:
ggplot(aes(x='finishedsquarefeet12', y='taxamount', color='logerror'), data=train_df) + \
    geom_point(alpha=0.7) + \
    scale_color_gradient(low = 'pink', high = 'blue')
Out[37]:
<ggplot: (15354312)>
In [38]:
import sklearn
from sklearn import ensemble, linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.dummy import DummyClassifier

train_y = train_df['logerror'].values
cat_cols = ["hashottuborspa", "propertycountylandusecode", "propertyzoningdesc", "fireplaceflag", "taxdelinquencyflag"]
train_df = train_df.drop(['parcelid', 'logerror', 'transactiondate', 'transaction_month']+cat_cols, axis=1)
feat_names = train_df.columns.values
In [39]:
model = ensemble.ExtraTreesRegressor(n_estimators=25, max_depth=30, max_features=0.3, n_jobs=-1, random_state=0)
model.fit(train_df, train_y)

## plot the importances ##
importances = model.feature_importances_
std = np.std([tree.feature_importances_ for tree in model.estimators_], axis=0)
indices = np.argsort(importances)[::-1][:20]

plt.figure(figsize=(12,12))
plt.title("Feature importances")
plt.bar(range(len(indices)), importances[indices], color="r", yerr=std[indices], align="center")
plt.xticks(range(len(indices)), feat_names[indices], rotation='vertical')
plt.xlim([-1, len(indices)])
plt.show()
In [40]:
#SKlearn: Linear Regression & ExtraTreesRegressor
import numpy as np
import pandas as pd
import gc
from sklearn.metrics import mean_absolute_error, make_scorer
from sklearn.model_selection import GridSearchCV

start = time.time()

print('Loading data ...')
train  = pd.read_csv('train_2016_v2.csv')
prop   = pd.read_csv('properties_2016.csv')
sample = pd.read_csv('sample_submission.csv')


print('Binding to float32')
for c, dtype in zip(prop.columns, prop.dtypes):
    if dtype == np.float64:
        prop[c] = prop[c].astype(np.float32)

        
print('Creating training set ...')
df_train = train.merge(prop, how='left', on='parcelid')

x_train = df_train.drop(['parcelid',
                         'logerror',
                         'transactiondate',
                         'propertyzoningdesc',
                         'propertycountylandusecode'], axis=1)
y_train = df_train['logerror'].values
print(x_train.shape, y_train.shape)

train_columns = x_train.columns

for c in x_train.dtypes[x_train.dtypes == object].index.values:
    x_train[c] = (x_train[c] == True)

print('deleting df_train ...')
del df_train; gc.collect()

split = 80000
x_train, y_train, x_valid, y_valid = x_train[:split], y_train[:split], x_train[split:], y_train[split:]

mean_values  = x_train.mean(axis=0)
x_train = x_train.fillna(mean_values, inplace=True)
mean_values  = x_valid.mean(axis=0)
x_valid = x_valid.fillna(mean_values, inplace=True)

model =  linear_model.LinearRegression()
model.fit(x_train, y_train)
print('Predicting on validation set ...')
pred  = model.predict(x_valid)
print('LinearRegression Model validation set MAE: '                 + str(mean_absolute_error(y_valid, pred)))
print()

model = linear_model.RANSACRegressor()
model.fit(x_train, y_train)
print('Predicting on validation set ...')
pred  = model.predict(x_valid)
print('RANSACRegressor Model validation set MAE: '                 + str(mean_absolute_error(y_valid, pred)))
print()

model = linear_model.TheilSenRegressor(random_state=42)
model.fit(x_train, y_train)
print('Predicting on validation set ...')
pred  = model.predict(x_valid)
print('TheilSenRegressor Model validation set MAE: '                 + str(mean_absolute_error(y_valid, pred)))
print()

model = linear_model.BayesianRidge(compute_score=True)
model.fit(x_train, y_train)
print('Predicting on validation set ...')
pred  = model.predict(x_valid)
print('BayesianRidge Model validation set MAE: '                 + str(mean_absolute_error(y_valid, pred)))
print()

model = ensemble.ExtraTreesRegressor(n_estimators=50, max_depth=30, max_features=0.3, random_state=777)
model.fit(x_train, y_train)
print('Predicting on validation set ...')
pred  = model.predict(x_valid)
print('ExtraTreesRegressor Model validation set MAE: '                 + str(mean_absolute_error(y_valid, pred)))
Loading data ...
Binding to float32
Creating training set ...
((90275, 55), (90275L,))
deleting df_train ...
Predicting on validation set ...
LinearRegression Model validation set MAE: 0.0672385235425
()
Predicting on validation set ...
RANSACRegressor Model validation set MAE: 0.29966305984
()
Predicting on validation set ...
TheilSenRegressor Model validation set MAE: 0.1421569654
()
Predicting on validation set ...
BayesianRidge Model validation set MAE: 0.0667174886317
()
Predicting on validation set ...
ExtraTreesRegressor Model validation set MAE: 0.0761310083178
In [41]:
xtrain, xtest, ytrain, ytest = train_test_split(train_df, train_y,
                                                test_size=0.11,
                                                random_state=2017)



model =  linear_model.LinearRegression()
model.fit(xtrain, ytrain)
pred = model.predict(xtest)
mean_absolute_error(ytest, pred)
Out[41]:
0.060771101149024634
In [42]:
xtrain, xtest, ytrain, ytest = train_test_split(train_df, train_y,
                                                test_size=0.11,
                                                random_state=2017)



model = ensemble.ExtraTreesRegressor(n_estimators=100, max_depth=30, max_features=0.3, n_jobs=-1, random_state=777)
model.fit(xtrain, ytrain)
pred = model.predict(xtest)
mean_absolute_error(ytest, pred)
Out[42]:
0.061565905291596092
In [43]:
import numpy as np
import pandas as pd
import gc
from sklearn.metrics import mean_absolute_error, make_scorer
from sklearn.model_selection import GridSearchCV

start = time.time()

print('Loading data ...')
train  = pd.read_csv('train_2016_v2.csv')
prop   = pd.read_csv('properties_2016.csv')
sample = pd.read_csv('sample_submission.csv')


print('Binding to float32')
for c, dtype in zip(prop.columns, prop.dtypes):
    if dtype == np.float64:
        prop[c] = prop[c].astype(np.float32)

        
print('Creating training set ...')
df_train = train.merge(prop, how='left', on='parcelid')

x_train = df_train.drop(['parcelid',
                         'logerror',
                         'transactiondate',
                         'propertyzoningdesc',
                         'propertycountylandusecode'], axis=1)
y_train = df_train['logerror'].values
print(x_train.shape, y_train.shape)

train_columns = x_train.columns

for c in x_train.dtypes[x_train.dtypes == object].index.values:
    x_train[c] = (x_train[c] == True)

c1 = df_train['logerror'].mean()
c2 = df_train['logerror'].median()
print('Logerror mean: ' + str(c1))
print('Logerror median: ' + str(c2))
print('deleting df_train ...')
del df_train; gc.collect()

split = 80000
x_train, y_train, x_valid, y_valid = x_train[:split], y_train[:split], x_train[split:], y_train[split:]


mean_values  = x_train.mean(axis=0)
x_train = x_train.fillna(mean_values, inplace=True)

#model = ensemble.ExtraTreesRegressor(n_estimators=100, max_depth=30, max_features=0.3, n_jobs=-1, random_state=777)

parameters = {'n_estimators':[50, 75, 100], 'max_depth':[10, 30, 50], 'max_features':[0.1, 0.3, 0.5]}
extratree = ensemble.ExtraTreesRegressor() #criterion='mae'
model = GridSearchCV(extratree, parameters,verbose=2,scoring=make_scorer(mean_absolute_error))
model.fit(x_train, y_train)
Loading data ...
Binding to float32
Creating training set ...
((90275, 55), (90275L,))
Logerror mean: 0.0114572196068
Logerror median: 0.006
deleting df_train ...
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[CV] max_features=0.1, n_estimators=50, max_depth=10 .................
[CV] .. max_features=0.1, n_estimators=50, max_depth=10, total=   0.9s
[CV] max_features=0.1, n_estimators=50, max_depth=10 .................
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.1s remaining:    0.0s
[CV] .. max_features=0.1, n_estimators=50, max_depth=10, total=   0.8s
[CV] max_features=0.1, n_estimators=50, max_depth=10 .................
[CV] .. max_features=0.1, n_estimators=50, max_depth=10, total=   0.9s
[CV] max_features=0.1, n_estimators=75, max_depth=10 .................
[CV] .. max_features=0.1, n_estimators=75, max_depth=10, total=   1.3s
[CV] max_features=0.1, n_estimators=75, max_depth=10 .................
[CV] .. max_features=0.1, n_estimators=75, max_depth=10, total=   1.3s
[CV] max_features=0.1, n_estimators=75, max_depth=10 .................
[CV] .. max_features=0.1, n_estimators=75, max_depth=10, total=   1.3s
[CV] max_features=0.1, n_estimators=100, max_depth=10 ................
[CV] . max_features=0.1, n_estimators=100, max_depth=10, total=   1.7s
[CV] max_features=0.1, n_estimators=100, max_depth=10 ................
[CV] . max_features=0.1, n_estimators=100, max_depth=10, total=   1.5s
[CV] max_features=0.1, n_estimators=100, max_depth=10 ................
[CV] . max_features=0.1, n_estimators=100, max_depth=10, total=   1.6s
[CV] max_features=0.3, n_estimators=50, max_depth=10 .................
[CV] .. max_features=0.3, n_estimators=50, max_depth=10, total=   1.7s
[CV] max_features=0.3, n_estimators=50, max_depth=10 .................
[CV] .. max_features=0.3, n_estimators=50, max_depth=10, total=   1.8s
[CV] max_features=0.3, n_estimators=50, max_depth=10 .................
[CV] .. max_features=0.3, n_estimators=50, max_depth=10, total=   1.7s
[CV] max_features=0.3, n_estimators=75, max_depth=10 .................
[CV] .. max_features=0.3, n_estimators=75, max_depth=10, total=   2.5s
[CV] max_features=0.3, n_estimators=75, max_depth=10 .................
[CV] .. max_features=0.3, n_estimators=75, max_depth=10, total=   2.5s
[CV] max_features=0.3, n_estimators=75, max_depth=10 .................
[CV] .. max_features=0.3, n_estimators=75, max_depth=10, total=   2.5s
[CV] max_features=0.3, n_estimators=100, max_depth=10 ................
[CV] . max_features=0.3, n_estimators=100, max_depth=10, total=   3.4s
[CV] max_features=0.3, n_estimators=100, max_depth=10 ................
[CV] . max_features=0.3, n_estimators=100, max_depth=10, total=   3.4s
[CV] max_features=0.3, n_estimators=100, max_depth=10 ................
[CV] . max_features=0.3, n_estimators=100, max_depth=10, total=   3.4s
[CV] max_features=0.5, n_estimators=50, max_depth=10 .................
[CV] .. max_features=0.5, n_estimators=50, max_depth=10, total=   2.6s
[CV] max_features=0.5, n_estimators=50, max_depth=10 .................
[CV] .. max_features=0.5, n_estimators=50, max_depth=10, total=   2.6s
[CV] max_features=0.5, n_estimators=50, max_depth=10 .................
[CV] .. max_features=0.5, n_estimators=50, max_depth=10, total=   2.6s
[CV] max_features=0.5, n_estimators=75, max_depth=10 .................
[CV] .. max_features=0.5, n_estimators=75, max_depth=10, total=   4.0s
[CV] max_features=0.5, n_estimators=75, max_depth=10 .................
[CV] .. max_features=0.5, n_estimators=75, max_depth=10, total=   4.0s
[CV] max_features=0.5, n_estimators=75, max_depth=10 .................
[CV] .. max_features=0.5, n_estimators=75, max_depth=10, total=   4.2s
[CV] max_features=0.5, n_estimators=100, max_depth=10 ................
[CV] . max_features=0.5, n_estimators=100, max_depth=10, total=   5.2s
[CV] max_features=0.5, n_estimators=100, max_depth=10 ................
[CV] . max_features=0.5, n_estimators=100, max_depth=10, total=   5.4s
[CV] max_features=0.5, n_estimators=100, max_depth=10 ................
[CV] . max_features=0.5, n_estimators=100, max_depth=10, total=   5.4s
[CV] max_features=0.1, n_estimators=50, max_depth=30 .................
[CV] .. max_features=0.1, n_estimators=50, max_depth=30, total=   3.3s
[CV] max_features=0.1, n_estimators=50, max_depth=30 .................
[CV] .. max_features=0.1, n_estimators=50, max_depth=30, total=   3.3s
[CV] max_features=0.1, n_estimators=50, max_depth=30 .................
[CV] .. max_features=0.1, n_estimators=50, max_depth=30, total=   3.3s
[CV] max_features=0.1, n_estimators=75, max_depth=30 .................
[CV] .. max_features=0.1, n_estimators=75, max_depth=30, total=   4.9s
[CV] max_features=0.1, n_estimators=75, max_depth=30 .................
[CV] .. max_features=0.1, n_estimators=75, max_depth=30, total=   4.8s
[CV] max_features=0.1, n_estimators=75, max_depth=30 .................
[CV] .. max_features=0.1, n_estimators=75, max_depth=30, total=   4.9s
[CV] max_features=0.1, n_estimators=100, max_depth=30 ................
[CV] . max_features=0.1, n_estimators=100, max_depth=30, total=   6.7s
[CV] max_features=0.1, n_estimators=100, max_depth=30 ................
[CV] . max_features=0.1, n_estimators=100, max_depth=30, total=   6.4s
[CV] max_features=0.1, n_estimators=100, max_depth=30 ................
[CV] . max_features=0.1, n_estimators=100, max_depth=30, total=   6.4s
[CV] max_features=0.3, n_estimators=50, max_depth=30 .................
[CV] .. max_features=0.3, n_estimators=50, max_depth=30, total=   6.6s
[CV] max_features=0.3, n_estimators=50, max_depth=30 .................
[CV] .. max_features=0.3, n_estimators=50, max_depth=30, total=   6.4s
[CV] max_features=0.3, n_estimators=50, max_depth=30 .................
[CV] .. max_features=0.3, n_estimators=50, max_depth=30, total=   6.3s
[CV] max_features=0.3, n_estimators=75, max_depth=30 .................
[CV] .. max_features=0.3, n_estimators=75, max_depth=30, total=   9.7s
[CV] max_features=0.3, n_estimators=75, max_depth=30 .................
[CV] .. max_features=0.3, n_estimators=75, max_depth=30, total=   9.9s
[CV] max_features=0.3, n_estimators=75, max_depth=30 .................
[CV] .. max_features=0.3, n_estimators=75, max_depth=30, total=   9.7s
[CV] max_features=0.3, n_estimators=100, max_depth=30 ................
[CV] . max_features=0.3, n_estimators=100, max_depth=30, total=  12.9s
[CV] max_features=0.3, n_estimators=100, max_depth=30 ................
[CV] . max_features=0.3, n_estimators=100, max_depth=30, total=  12.8s
[CV] max_features=0.3, n_estimators=100, max_depth=30 ................
[CV] . max_features=0.3, n_estimators=100, max_depth=30, total=  13.0s
[CV] max_features=0.5, n_estimators=50, max_depth=30 .................
[CV] .. max_features=0.5, n_estimators=50, max_depth=30, total=   9.8s
[CV] max_features=0.5, n_estimators=50, max_depth=30 .................
[CV] .. max_features=0.5, n_estimators=50, max_depth=30, total=  10.5s
[CV] max_features=0.5, n_estimators=50, max_depth=30 .................
[CV] .. max_features=0.5, n_estimators=50, max_depth=30, total=   9.5s
[CV] max_features=0.5, n_estimators=75, max_depth=30 .................
[CV] .. max_features=0.5, n_estimators=75, max_depth=30, total=  15.0s
[CV] max_features=0.5, n_estimators=75, max_depth=30 .................
[CV] .. max_features=0.5, n_estimators=75, max_depth=30, total=  14.8s
[CV] max_features=0.5, n_estimators=75, max_depth=30 .................
[CV] .. max_features=0.5, n_estimators=75, max_depth=30, total=  14.5s
[CV] max_features=0.5, n_estimators=100, max_depth=30 ................
[CV] . max_features=0.5, n_estimators=100, max_depth=30, total=  18.8s
[CV] max_features=0.5, n_estimators=100, max_depth=30 ................
[CV] . max_features=0.5, n_estimators=100, max_depth=30, total=  19.8s
[CV] max_features=0.5, n_estimators=100, max_depth=30 ................
[CV] . max_features=0.5, n_estimators=100, max_depth=30, total=  20.1s
[CV] max_features=0.1, n_estimators=50, max_depth=50 .................
[CV] .. max_features=0.1, n_estimators=50, max_depth=50, total=   5.5s
[CV] max_features=0.1, n_estimators=50, max_depth=50 .................
[CV] .. max_features=0.1, n_estimators=50, max_depth=50, total=   5.5s
[CV] max_features=0.1, n_estimators=50, max_depth=50 .................
[CV] .. max_features=0.1, n_estimators=50, max_depth=50, total=   6.0s
[CV] max_features=0.1, n_estimators=75, max_depth=50 .................
[CV] .. max_features=0.1, n_estimators=75, max_depth=50, total=   8.0s
[CV] max_features=0.1, n_estimators=75, max_depth=50 .................
[CV] .. max_features=0.1, n_estimators=75, max_depth=50, total=   8.5s
[CV] max_features=0.1, n_estimators=75, max_depth=50 .................
[CV] .. max_features=0.1, n_estimators=75, max_depth=50, total=   8.1s
[CV] max_features=0.1, n_estimators=100, max_depth=50 ................
[CV] . max_features=0.1, n_estimators=100, max_depth=50, total=  11.2s
[CV] max_features=0.1, n_estimators=100, max_depth=50 ................
[CV] . max_features=0.1, n_estimators=100, max_depth=50, total=  10.8s
[CV] max_features=0.1, n_estimators=100, max_depth=50 ................
[CV] . max_features=0.1, n_estimators=100, max_depth=50, total=  10.9s
[CV] max_features=0.3, n_estimators=50, max_depth=50 .................
[CV] .. max_features=0.3, n_estimators=50, max_depth=50, total=  10.0s
[CV] max_features=0.3, n_estimators=50, max_depth=50 .................
[CV] .. max_features=0.3, n_estimators=50, max_depth=50, total=   9.7s
[CV] max_features=0.3, n_estimators=50, max_depth=50 .................
[CV] .. max_features=0.3, n_estimators=50, max_depth=50, total=   9.5s
[CV] max_features=0.3, n_estimators=75, max_depth=50 .................
[CV] .. max_features=0.3, n_estimators=75, max_depth=50, total=  14.4s
[CV] max_features=0.3, n_estimators=75, max_depth=50 .................
[CV] .. max_features=0.3, n_estimators=75, max_depth=50, total=  14.6s
[CV] max_features=0.3, n_estimators=75, max_depth=50 .................
[CV] .. max_features=0.3, n_estimators=75, max_depth=50, total=  14.6s
[CV] max_features=0.3, n_estimators=100, max_depth=50 ................
[CV] . max_features=0.3, n_estimators=100, max_depth=50, total=  18.9s
[CV] max_features=0.3, n_estimators=100, max_depth=50 ................
[CV] . max_features=0.3, n_estimators=100, max_depth=50, total=  20.2s
[CV] max_features=0.3, n_estimators=100, max_depth=50 ................
[CV] . max_features=0.3, n_estimators=100, max_depth=50, total=  19.8s
[CV] max_features=0.5, n_estimators=50, max_depth=50 .................
[CV] .. max_features=0.5, n_estimators=50, max_depth=50, total=  14.6s
[CV] max_features=0.5, n_estimators=50, max_depth=50 .................
[CV] .. max_features=0.5, n_estimators=50, max_depth=50, total=  14.1s
[CV] max_features=0.5, n_estimators=50, max_depth=50 .................
[CV] .. max_features=0.5, n_estimators=50, max_depth=50, total=  14.0s
[CV] max_features=0.5, n_estimators=75, max_depth=50 .................
[CV] .. max_features=0.5, n_estimators=75, max_depth=50, total=  22.6s
[CV] max_features=0.5, n_estimators=75, max_depth=50 .................
[CV] .. max_features=0.5, n_estimators=75, max_depth=50, total=  21.2s
[CV] max_features=0.5, n_estimators=75, max_depth=50 .................
[CV] .. max_features=0.5, n_estimators=75, max_depth=50, total=  21.1s
[CV] max_features=0.5, n_estimators=100, max_depth=50 ................
[CV] . max_features=0.5, n_estimators=100, max_depth=50, total=  29.5s
[CV] max_features=0.5, n_estimators=100, max_depth=50 ................
[CV] . max_features=0.5, n_estimators=100, max_depth=50, total=  29.3s
[CV] max_features=0.5, n_estimators=100, max_depth=50 ................
[CV] . max_features=0.5, n_estimators=100, max_depth=50, total=  27.8s
[Parallel(n_jobs=1)]: Done  81 out of  81 | elapsed: 14.0min finished
Out[43]:
GridSearchCV(cv=None, error_score='raise',
       estimator=ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=None,
          max_features='auto', max_leaf_nodes=None,
          min_impurity_decrease=0.0, min_impurity_split=None,
          min_samples_leaf=1, min_samples_split=2,
          min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
          oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [50, 75, 100], 'max_features': [0.1, 0.3, 0.5], 'max_depth': [10, 30, 50]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(mean_absolute_error), verbose=2)
In [44]:
#model.cv_results_
model.best_params_
Out[44]:
{'max_depth': 50, 'max_features': 0.5, 'n_estimators': 50}
In [45]:
mean_values  = x_valid.mean(axis=0)
x_valid = x_valid.fillna(mean_values, inplace=True)

dummy1 = DummyClassifier(constant=c1,random_state=0)
dummy2 = DummyClassifier(constant=c2,random_state=0)

dummy1.fit(x_train, y_train)
dummy2.fit(x_train, y_train)

print('Predicting on validation set ...')
pred  = model.predict(x_valid)
pred1 = dummy1.predict(x_valid)
pred2 = dummy2.predict(x_valid)
print('Model validation set MAE: '                 + str(mean_absolute_error(y_valid, pred)))
print('Mean Dummy regressor validation set MAE: '  + str(mean_absolute_error(y_valid, pred1)))
print('Median Dummy regressor validation set MAE: '+ str(mean_absolute_error(y_valid, pred2)))


print('Building test set ...')
sample['parcelid'] = sample['ParcelId']
df_test = sample.merge(prop, on='parcelid', how='left')


print('x_test ...')
x_test = df_test[train_columns]
for c in x_test.dtypes[x_test.dtypes == object].index.values:
    x_test[c] = (x_test[c] == True)
    
    
print('Fill NA ...')
mean_values  = x_test.mean(axis=0)
x_test = x_test.fillna(mean_values, inplace=True)



print('Predicting on test ...')
p_test = model.predict(x_test)
#p_test = 0.97*p_test + 0.03*0.011

print('deleting x_test ...')

del x_test; gc.collect()


sub = pd.read_csv('sample_submission.csv')
for c in sub.columns[sub.columns != 'ParcelId']:
    sub[c] = p_test

end = time.time()
print('time taken: ' + str(end - start))

print('Writing csv ...')
sub.to_csv('ExtraTrees_starter.csv', index=False, float_format='%.4f') 
Predicting on validation set ...
Model validation set MAE: 0.089835356656
Mean Dummy regressor validation set MAE: 0.112239231144
Median Dummy regressor validation set MAE: 0.112239231144
Building test set ...
x_test ...
Fill NA ...
Predicting on test ...
deleting x_test ...
time taken: 977.869999886
Writing csv ...
In [46]:
import xgboost as xgb
print(xgb.__version__)
xgb_params = {
    'eta': 0.05,
    'max_depth': 8,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'objective': 'reg:linear',
    'silent': 1,
    'seed' : 0
}
dtrain = xgb.DMatrix(train_df, train_y, feature_names=train_df.columns.values)
model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=50)

# plot the important features #
fig, ax = plt.subplots(figsize=(12,18))
xgb.plot_importance(model, max_num_features=50, height=0.8, ax=ax)
plt.show()
0.6
In [47]:
import numpy as np
import pandas as pd
import xgboost as xgb
import gc

print('Loading data ...')
train  = pd.read_csv('train_2016_v2.csv')
prop   = pd.read_csv('properties_2016.csv')
sample = pd.read_csv('sample_submission.csv')

print('Binding to float32')

for c, dtype in zip(prop.columns, prop.dtypes):
    if dtype == np.float64:
        prop[c] = prop[c].astype(np.float32)

print('Creating training set ...')

df_train = train.merge(prop, how='left', on='parcelid')

x_train = df_train.drop(['parcelid',
                         'logerror',
                         'transactiondate',
                         'propertyzoningdesc',
                         'propertycountylandusecode'], axis=1)
y_train = df_train['logerror'].values
print(x_train.shape, y_train.shape)

train_columns = x_train.columns

for c in x_train.dtypes[x_train.dtypes == object].index.values:
    x_train[c] = (x_train[c] == True)

del df_train; gc.collect()

split = 80000
x_train, y_train, x_valid, y_valid = x_train[:split], y_train[:split], x_train[split:], y_train[split:]

print('Building DMatrix...')

d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_valid, label=y_valid)

del x_train; gc.collect() #, x_valid

print('Training ...')

params = {
    'eta'         : 0.02,
    'objective'   : 'reg:linear',
    'eval_metric' : 'mae',
    'max_depth'   : 4,
    'silent'      : 1,
}

watchlist = [(d_train, 'train'), (d_valid, 'valid')]
clf = xgb.train(params,
                d_train,
                10000,
                watchlist,
                early_stopping_rounds=100,
                verbose_eval=10)

del d_train #, d_valid

print('Building test set ...')

sample['parcelid'] = sample['ParcelId']
df_test = sample.merge(prop, on='parcelid', how='left')

del prop; gc.collect()

x_test = df_test[train_columns]
for c in x_test.dtypes[x_test.dtypes == object].index.values:
    x_test[c] = (x_test[c] == True)

del df_test, sample; gc.collect()

d_test = xgb.DMatrix(x_test)

del x_test; gc.collect()


print('Predicting on validation set ...')
pred  = clf.predict(d_valid)
pred1 = dummy1.predict(x_valid)
pred2 = dummy2.predict(x_valid)
print('Validation set MAE:                       '+ str(mean_absolute_error(y_valid, pred)))
print('Mean Dummy regressor validation set MAE: ' + str(mean_absolute_error(y_valid, pred1)))
print('Median Dummy regresor validation set MAE: '+ str(mean_absolute_error(y_valid, pred2)))


print('Predicting on test ...')

p_test = clf.predict(d_test)
p_test = 0.97*p_test + 0.03*0.011

del d_test; gc.collect()

sub = pd.read_csv('sample_submission.csv')
for c in sub.columns[sub.columns != 'ParcelId']:
    sub[c] = p_test

print('Writing csv ...')
sub.to_csv('xgb_starter.csv', index=False, float_format='%.4f') 
Loading data ...
Binding to float32
Creating training set ...
((90275, 55), (90275L,))
Building DMatrix...
Training ...
[0]	train-mae:0.488065	valid-mae:0.48112
Multiple eval metrics have been passed: 'valid-mae' will be used for early stopping.

Will train until valid-mae hasn't improved in 100 rounds.
[10]	train-mae:0.402221	valid-mae:0.395444
[20]	train-mae:0.33268	valid-mae:0.326098
[30]	train-mae:0.276518	valid-mae:0.270132
[40]	train-mae:0.231316	valid-mae:0.225213
[50]	train-mae:0.195059	valid-mae:0.189317
[60]	train-mae:0.166121	valid-mae:0.16072
[70]	train-mae:0.143115	valid-mae:0.138042
[80]	train-mae:0.124973	valid-mae:0.120213
[90]	train-mae:0.11079	valid-mae:0.106351
[100]	train-mae:0.099822	valid-mae:0.095702
[110]	train-mae:0.091454	valid-mae:0.087592
[120]	train-mae:0.085149	valid-mae:0.08158
[130]	train-mae:0.080456	valid-mae:0.077192
[140]	train-mae:0.077015	valid-mae:0.074063
[150]	train-mae:0.07451	valid-mae:0.071827
[160]	train-mae:0.072688	valid-mae:0.070245
[170]	train-mae:0.071374	valid-mae:0.069129
[180]	train-mae:0.070415	valid-mae:0.068366
[190]	train-mae:0.069715	valid-mae:0.067854
[200]	train-mae:0.069209	valid-mae:0.067512
[210]	train-mae:0.068828	valid-mae:0.06727
[220]	train-mae:0.068547	valid-mae:0.067113
[230]	train-mae:0.068334	valid-mae:0.067006
[240]	train-mae:0.068171	valid-mae:0.066937
[250]	train-mae:0.068039	valid-mae:0.066892
[260]	train-mae:0.067937	valid-mae:0.066866
[270]	train-mae:0.067856	valid-mae:0.066848
[280]	train-mae:0.067789	valid-mae:0.06684
[290]	train-mae:0.067735	valid-mae:0.066836
[300]	train-mae:0.067691	valid-mae:0.066838
[310]	train-mae:0.067655	valid-mae:0.066842
[320]	train-mae:0.067618	valid-mae:0.066841
[330]	train-mae:0.067586	valid-mae:0.066845
[340]	train-mae:0.067563	valid-mae:0.06686
[350]	train-mae:0.06754	valid-mae:0.066865
[360]	train-mae:0.067518	valid-mae:0.066872
[370]	train-mae:0.0675	valid-mae:0.066875
[380]	train-mae:0.067482	valid-mae:0.066877
Stopping. Best iteration:
[288]	train-mae:0.067745	valid-mae:0.066835

Building test set ...
Predicting on validation set ...
Validation set MAE:                       0.0668864560513
Mean Dummy regressor validation set MAE: 0.112239231144
Median Dummy regresor validation set MAE: 0.112239231144
Predicting on test ...
Writing csv ...
In [48]:
# Any results you write to the current directory are saved as output.
import numpy as np
import pandas as pd
import lightgbm as lgb
import gc

print('Loading data ...')
train = pd.read_csv('train_2016_v2.csv')
prop  = pd.read_csv('properties_2016.csv')

for c, dtype in zip(prop.columns, prop.dtypes):
    if dtype == np.float64:
        prop[c] = prop[c].astype(np.float32)

df_train = train.merge(prop, how='left', on='parcelid')

x_train = df_train.drop(['parcelid',
                         'logerror',
                         'transactiondate',
                         'propertyzoningdesc',
                         'propertycountylandusecode'], axis=1)

y_train = df_train['logerror'].values
print(x_train.shape, y_train.shape)

train_columns = x_train.columns

for c in x_train.dtypes[x_train.dtypes == object].index.values:
    x_train[c] = (x_train[c] == True)

del df_train; gc.collect()

split = 80000
x_train, y_train, x_valid, y_valid = x_train[:split], y_train[:split], x_train[split:], y_train[split:]
x_train = x_train.values.astype(np.float32, copy=False)
x_valid = x_valid.values.astype(np.float32, copy=False)

d_train = lgb.Dataset(x_train, label=y_train)
d_valid = lgb.Dataset(x_valid, label=y_valid)

params = {
    'max_bin'         : 20,
    'learning_rate'   : 0.0021,        # shrinkage_rate
    'boosting_type'   : 'gbdt',
    'objective'       : 'regression',
    'metric'          : 'l1',          # or 'mae'
    'sub_feature'     : 0.5,           # feature_fraction
    'bagging_fraction': 0.85,          # sub_row
    'bagging_freq'    : 40,
    'num_leaves'      : 512,           # num_leaf
    'min_data'        : 500,           # min_data_in_leaf
    'min_hessian'     : 0.05,          # min_sum_hessian_in_leaf
          }


watchlist = [d_valid]
clf       = lgb.train(params, d_train, 500, watchlist)

del d_train, d_valid; gc.collect()
del x_train; gc.collect()  #, x_valid

print("Prepare for the prediction ...")
sample = pd.read_csv('sample_submission.csv')
sample['parcelid'] = sample['ParcelId']

df_test = sample.merge(prop, on='parcelid', how='left')
del sample, prop; gc.collect()

x_test = df_test[train_columns]
del df_test; gc.collect()

for c in x_test.dtypes[x_test.dtypes == object].index.values:
    x_test[c] = (x_test[c] == True)
x_test = x_test.values.astype(np.float32, copy=False)

print('Predicting on validation set ...')
pred = clf.predict(x_valid)
print('Validation set MAE: '+ str(mean_absolute_error(y_valid, pred)))

print("Start prediction ...")
# num_threads > 1 will predict very slow in kernel
clf.reset_parameter({"num_threads":1})
p_test = clf.predict(x_test)
p_test = 0.97*p_test + 0.03*0.011

del x_test; gc.collect()

print("Start write result ...")
sub = pd.read_csv('sample_submission.csv')
for c in sub.columns[sub.columns != 'ParcelId']:
    sub[c] = p_test

sub.to_csv('lgb_starter_1.csv', index=False, float_format='%.4f')
Loading data ...
((90275, 55), (90275L,))
[1]	valid_0's l1: 0.0665479
[2]	valid_0's l1: 0.0665453
[3]	valid_0's l1: 0.0665439
[4]	valid_0's l1: 0.0665406
[5]	valid_0's l1: 0.0665382
[6]	valid_0's l1: 0.0665355
[7]	valid_0's l1: 0.0665331
[8]	valid_0's l1: 0.0665308
[9]	valid_0's l1: 0.0665281
[10]	valid_0's l1: 0.0665264
[11]	valid_0's l1: 0.0665229
[12]	valid_0's l1: 0.0665212
[13]	valid_0's l1: 0.0665184
[14]	valid_0's l1: 0.0665169
[15]	valid_0's l1: 0.0665128
[16]	valid_0's l1: 0.0665109
[17]	valid_0's l1: 0.0665092
[18]	valid_0's l1: 0.0665063
[19]	valid_0's l1: 0.0665036
[20]	valid_0's l1: 0.0665005
[21]	valid_0's l1: 0.0664983
[22]	valid_0's l1: 0.0664959
[23]	valid_0's l1: 0.0664942
[24]	valid_0's l1: 0.0664916
[25]	valid_0's l1: 0.0664903
[26]	valid_0's l1: 0.0664878
[27]	valid_0's l1: 0.0664855
[28]	valid_0's l1: 0.0664825
[29]	valid_0's l1: 0.0664792
[30]	valid_0's l1: 0.0664775
[31]	valid_0's l1: 0.0664753
[32]	valid_0's l1: 0.0664723
[33]	valid_0's l1: 0.0664703
[34]	valid_0's l1: 0.0664673
[35]	valid_0's l1: 0.0664646
[36]	valid_0's l1: 0.0664636
[37]	valid_0's l1: 0.0664606
[38]	valid_0's l1: 0.0664581
[39]	valid_0's l1: 0.0664556
[40]	valid_0's l1: 0.066453
[41]	valid_0's l1: 0.0664502
[42]	valid_0's l1: 0.0664479
[43]	valid_0's l1: 0.0664455
[44]	valid_0's l1: 0.0664437
[45]	valid_0's l1: 0.0664408
[46]	valid_0's l1: 0.0664389
[47]	valid_0's l1: 0.0664365
[48]	valid_0's l1: 0.0664351
[49]	valid_0's l1: 0.0664328
[50]	valid_0's l1: 0.0664302
[51]	valid_0's l1: 0.0664275
[52]	valid_0's l1: 0.0664251
[53]	valid_0's l1: 0.0664225
[54]	valid_0's l1: 0.0664194
[55]	valid_0's l1: 0.0664172
[56]	valid_0's l1: 0.0664153
[57]	valid_0's l1: 0.0664134
[58]	valid_0's l1: 0.0664117
[59]	valid_0's l1: 0.0664099
[60]	valid_0's l1: 0.0664076
[61]	valid_0's l1: 0.066406
[62]	valid_0's l1: 0.0664042
[63]	valid_0's l1: 0.0664026
[64]	valid_0's l1: 0.0664011
[65]	valid_0's l1: 0.0663988
[66]	valid_0's l1: 0.0663963
[67]	valid_0's l1: 0.0663939
[68]	valid_0's l1: 0.0663922
[69]	valid_0's l1: 0.0663901
[70]	valid_0's l1: 0.0663873
[71]	valid_0's l1: 0.0663857
[72]	valid_0's l1: 0.0663832
[73]	valid_0's l1: 0.0663815
[74]	valid_0's l1: 0.0663803
[75]	valid_0's l1: 0.0663788
[76]	valid_0's l1: 0.0663773
[77]	valid_0's l1: 0.0663752
[78]	valid_0's l1: 0.0663733
[79]	valid_0's l1: 0.0663713
[80]	valid_0's l1: 0.0663689
[81]	valid_0's l1: 0.0663665
[82]	valid_0's l1: 0.0663648
[83]	valid_0's l1: 0.0663637
[84]	valid_0's l1: 0.0663616
[85]	valid_0's l1: 0.06636
[86]	valid_0's l1: 0.0663574
[87]	valid_0's l1: 0.0663551
[88]	valid_0's l1: 0.0663537
[89]	valid_0's l1: 0.0663521
[90]	valid_0's l1: 0.0663499
[91]	valid_0's l1: 0.0663479
[92]	valid_0's l1: 0.0663454
[93]	valid_0's l1: 0.0663436
[94]	valid_0's l1: 0.066342
[95]	valid_0's l1: 0.0663397
[96]	valid_0's l1: 0.0663375
[97]	valid_0's l1: 0.0663351
[98]	valid_0's l1: 0.0663332
[99]	valid_0's l1: 0.0663309
[100]	valid_0's l1: 0.0663294
[101]	valid_0's l1: 0.0663279
[102]	valid_0's l1: 0.066327
[103]	valid_0's l1: 0.0663259
[104]	valid_0's l1: 0.0663246
[105]	valid_0's l1: 0.0663233
[106]	valid_0's l1: 0.0663216
[107]	valid_0's l1: 0.0663198
[108]	valid_0's l1: 0.0663175
[109]	valid_0's l1: 0.0663162
[110]	valid_0's l1: 0.066315
[111]	valid_0's l1: 0.0663127
[112]	valid_0's l1: 0.0663114
[113]	valid_0's l1: 0.0663098
[114]	valid_0's l1: 0.0663076
[115]	valid_0's l1: 0.066306
[116]	valid_0's l1: 0.0663047
[117]	valid_0's l1: 0.0663029
[118]	valid_0's l1: 0.0663011
[119]	valid_0's l1: 0.0662994
[120]	valid_0's l1: 0.0662971
[121]	valid_0's l1: 0.0662958
[122]	valid_0's l1: 0.0662947
[123]	valid_0's l1: 0.0662927
[124]	valid_0's l1: 0.0662916
[125]	valid_0's l1: 0.06629
[126]	valid_0's l1: 0.0662887
[127]	valid_0's l1: 0.0662871
[128]	valid_0's l1: 0.0662849
[129]	valid_0's l1: 0.066283
[130]	valid_0's l1: 0.0662806
[131]	valid_0's l1: 0.066279
[132]	valid_0's l1: 0.0662773
[133]	valid_0's l1: 0.066275
[134]	valid_0's l1: 0.0662735
[135]	valid_0's l1: 0.066272
[136]	valid_0's l1: 0.0662703
[137]	valid_0's l1: 0.0662686
[138]	valid_0's l1: 0.0662677
[139]	valid_0's l1: 0.0662669
[140]	valid_0's l1: 0.0662651
[141]	valid_0's l1: 0.0662632
[142]	valid_0's l1: 0.0662619
[143]	valid_0's l1: 0.0662603
[144]	valid_0's l1: 0.0662586
[145]	valid_0's l1: 0.0662571
[146]	valid_0's l1: 0.0662553
[147]	valid_0's l1: 0.0662542
[148]	valid_0's l1: 0.066253
[149]	valid_0's l1: 0.0662519
[150]	valid_0's l1: 0.0662504
[151]	valid_0's l1: 0.0662493
[152]	valid_0's l1: 0.0662477
[153]	valid_0's l1: 0.0662467
[154]	valid_0's l1: 0.0662457
[155]	valid_0's l1: 0.0662445
[156]	valid_0's l1: 0.0662423
[157]	valid_0's l1: 0.066241
[158]	valid_0's l1: 0.0662396
[159]	valid_0's l1: 0.0662384
[160]	valid_0's l1: 0.0662372
[161]	valid_0's l1: 0.0662361
[162]	valid_0's l1: 0.0662349
[163]	valid_0's l1: 0.0662339
[164]	valid_0's l1: 0.0662328
[165]	valid_0's l1: 0.0662318
[166]	valid_0's l1: 0.0662305
[167]	valid_0's l1: 0.0662301
[168]	valid_0's l1: 0.066229
[169]	valid_0's l1: 0.0662268
[170]	valid_0's l1: 0.0662254
[171]	valid_0's l1: 0.0662244
[172]	valid_0's l1: 0.066223
[173]	valid_0's l1: 0.0662219
[174]	valid_0's l1: 0.0662205
[175]	valid_0's l1: 0.0662194
[176]	valid_0's l1: 0.0662184
[177]	valid_0's l1: 0.0662171
[178]	valid_0's l1: 0.0662158
[179]	valid_0's l1: 0.0662142
[180]	valid_0's l1: 0.0662134
[181]	valid_0's l1: 0.0662119
[182]	valid_0's l1: 0.0662117
[183]	valid_0's l1: 0.06621
[184]	valid_0's l1: 0.0662086
[185]	valid_0's l1: 0.0662075
[186]	valid_0's l1: 0.0662059
[187]	valid_0's l1: 0.0662045
[188]	valid_0's l1: 0.0662032
[189]	valid_0's l1: 0.0662017
[190]	valid_0's l1: 0.0662007
[191]	valid_0's l1: 0.0661999
[192]	valid_0's l1: 0.0661987
[193]	valid_0's l1: 0.0661981
[194]	valid_0's l1: 0.066197
[195]	valid_0's l1: 0.0661962
[196]	valid_0's l1: 0.066195
[197]	valid_0's l1: 0.0661943
[198]	valid_0's l1: 0.0661934
[199]	valid_0's l1: 0.0661924
[200]	valid_0's l1: 0.0661915
[201]	valid_0's l1: 0.066191
[202]	valid_0's l1: 0.06619
[203]	valid_0's l1: 0.0661893
[204]	valid_0's l1: 0.0661889
[205]	valid_0's l1: 0.0661884
[206]	valid_0's l1: 0.0661873
[207]	valid_0's l1: 0.0661875
[208]	valid_0's l1: 0.0661867
[209]	valid_0's l1: 0.0661867
[210]	valid_0's l1: 0.0661859
[211]	valid_0's l1: 0.0661849
[212]	valid_0's l1: 0.0661837
[213]	valid_0's l1: 0.0661835
[214]	valid_0's l1: 0.0661823
[215]	valid_0's l1: 0.066181
[216]	valid_0's l1: 0.0661799
[217]	valid_0's l1: 0.0661798
[218]	valid_0's l1: 0.0661797
[219]	valid_0's l1: 0.0661784
[220]	valid_0's l1: 0.0661774
[221]	valid_0's l1: 0.0661766
[222]	valid_0's l1: 0.0661758
[223]	valid_0's l1: 0.0661745
[224]	valid_0's l1: 0.0661735
[225]	valid_0's l1: 0.0661724
[226]	valid_0's l1: 0.0661724
[227]	valid_0's l1: 0.0661719
[228]	valid_0's l1: 0.0661717
[229]	valid_0's l1: 0.0661706
[230]	valid_0's l1: 0.0661696
[231]	valid_0's l1: 0.0661694
[232]	valid_0's l1: 0.0661682
[233]	valid_0's l1: 0.066167
[234]	valid_0's l1: 0.0661663
[235]	valid_0's l1: 0.0661661
[236]	valid_0's l1: 0.0661652
[237]	valid_0's l1: 0.0661644
[238]	valid_0's l1: 0.0661633
[239]	valid_0's l1: 0.0661623
[240]	valid_0's l1: 0.0661612
[241]	valid_0's l1: 0.0661603
[242]	valid_0's l1: 0.0661597
[243]	valid_0's l1: 0.0661598
[244]	valid_0's l1: 0.066159
[245]	valid_0's l1: 0.0661589
[246]	valid_0's l1: 0.066158
[247]	valid_0's l1: 0.0661575
[248]	valid_0's l1: 0.0661573
[249]	valid_0's l1: 0.0661571
[250]	valid_0's l1: 0.0661564
[251]	valid_0's l1: 0.0661558
[252]	valid_0's l1: 0.0661552
[253]	valid_0's l1: 0.0661548
[254]	valid_0's l1: 0.0661543
[255]	valid_0's l1: 0.0661533
[256]	valid_0's l1: 0.0661523
[257]	valid_0's l1: 0.0661517
[258]	valid_0's l1: 0.066152
[259]	valid_0's l1: 0.0661516
[260]	valid_0's l1: 0.0661514
[261]	valid_0's l1: 0.0661512
[262]	valid_0's l1: 0.0661514
[263]	valid_0's l1: 0.0661512
[264]	valid_0's l1: 0.0661509
[265]	valid_0's l1: 0.0661507
[266]	valid_0's l1: 0.0661502
[267]	valid_0's l1: 0.0661499
[268]	valid_0's l1: 0.0661494
[269]	valid_0's l1: 0.0661489
[270]	valid_0's l1: 0.0661486
[271]	valid_0's l1: 0.0661485
[272]	valid_0's l1: 0.0661484
[273]	valid_0's l1: 0.0661481
[274]	valid_0's l1: 0.0661479
[275]	valid_0's l1: 0.0661475
[276]	valid_0's l1: 0.0661469
[277]	valid_0's l1: 0.0661463
[278]	valid_0's l1: 0.0661464
[279]	valid_0's l1: 0.0661462
[280]	valid_0's l1: 0.0661456
[281]	valid_0's l1: 0.0661457
[282]	valid_0's l1: 0.0661453
[283]	valid_0's l1: 0.0661451
[284]	valid_0's l1: 0.0661445
[285]	valid_0's l1: 0.0661434
[286]	valid_0's l1: 0.0661437
[287]	valid_0's l1: 0.0661433
[288]	valid_0's l1: 0.0661435
[289]	valid_0's l1: 0.0661428
[290]	valid_0's l1: 0.0661425
[291]	valid_0's l1: 0.0661422
[292]	valid_0's l1: 0.0661422
[293]	valid_0's l1: 0.0661418
[294]	valid_0's l1: 0.0661415
[295]	valid_0's l1: 0.0661412
[296]	valid_0's l1: 0.0661407
[297]	valid_0's l1: 0.0661406
[298]	valid_0's l1: 0.06614
[299]	valid_0's l1: 0.0661404
[300]	valid_0's l1: 0.0661402
[301]	valid_0's l1: 0.0661393
[302]	valid_0's l1: 0.0661396
[303]	valid_0's l1: 0.0661387
[304]	valid_0's l1: 0.0661384
[305]	valid_0's l1: 0.0661377
[306]	valid_0's l1: 0.0661372
[307]	valid_0's l1: 0.0661366
[308]	valid_0's l1: 0.0661359
[309]	valid_0's l1: 0.0661352
[310]	valid_0's l1: 0.0661349
[311]	valid_0's l1: 0.0661344
[312]	valid_0's l1: 0.0661342
[313]	valid_0's l1: 0.0661336
[314]	valid_0's l1: 0.0661338
[315]	valid_0's l1: 0.0661336
[316]	valid_0's l1: 0.0661338
[317]	valid_0's l1: 0.0661336
[318]	valid_0's l1: 0.0661333
[319]	valid_0's l1: 0.0661328
[320]	valid_0's l1: 0.0661326
[321]	valid_0's l1: 0.0661318
[322]	valid_0's l1: 0.0661313
[323]	valid_0's l1: 0.0661314
[324]	valid_0's l1: 0.0661317
[325]	valid_0's l1: 0.0661315
[326]	valid_0's l1: 0.0661318
[327]	valid_0's l1: 0.0661316
[328]	valid_0's l1: 0.066131
[329]	valid_0's l1: 0.0661302
[330]	valid_0's l1: 0.0661299
[331]	valid_0's l1: 0.0661295
[332]	valid_0's l1: 0.066129
[333]	valid_0's l1: 0.0661283
[334]	valid_0's l1: 0.0661283
[335]	valid_0's l1: 0.0661276
[336]	valid_0's l1: 0.0661271
[337]	valid_0's l1: 0.0661268
[338]	valid_0's l1: 0.0661262
[339]	valid_0's l1: 0.066126
[340]	valid_0's l1: 0.0661259
[341]	valid_0's l1: 0.0661258
[342]	valid_0's l1: 0.0661256
[343]	valid_0's l1: 0.0661252
[344]	valid_0's l1: 0.0661252
[345]	valid_0's l1: 0.0661249
[346]	valid_0's l1: 0.0661245
[347]	valid_0's l1: 0.0661243
[348]	valid_0's l1: 0.0661238
[349]	valid_0's l1: 0.0661236
[350]	valid_0's l1: 0.0661238
[351]	valid_0's l1: 0.0661234
[352]	valid_0's l1: 0.066123
[353]	valid_0's l1: 0.0661228
[354]	valid_0's l1: 0.0661226
[355]	valid_0's l1: 0.0661224
[356]	valid_0's l1: 0.066122
[357]	valid_0's l1: 0.0661221
[358]	valid_0's l1: 0.0661217
[359]	valid_0's l1: 0.0661217
[360]	valid_0's l1: 0.0661207
[361]	valid_0's l1: 0.066121
[362]	valid_0's l1: 0.0661207
[363]	valid_0's l1: 0.0661207
[364]	valid_0's l1: 0.0661203
[365]	valid_0's l1: 0.0661199
[366]	valid_0's l1: 0.0661191
[367]	valid_0's l1: 0.066119
[368]	valid_0's l1: 0.0661185
[369]	valid_0's l1: 0.0661185
[370]	valid_0's l1: 0.0661181
[371]	valid_0's l1: 0.0661183
[372]	valid_0's l1: 0.0661183
[373]	valid_0's l1: 0.0661182
[374]	valid_0's l1: 0.0661182
[375]	valid_0's l1: 0.0661173
[376]	valid_0's l1: 0.0661176
[377]	valid_0's l1: 0.0661174
[378]	valid_0's l1: 0.0661173
[379]	valid_0's l1: 0.0661174
[380]	valid_0's l1: 0.0661176
[381]	valid_0's l1: 0.066118
[382]	valid_0's l1: 0.0661182
[383]	valid_0's l1: 0.0661179
[384]	valid_0's l1: 0.0661176
[385]	valid_0's l1: 0.0661169
[386]	valid_0's l1: 0.0661168
[387]	valid_0's l1: 0.0661169
[388]	valid_0's l1: 0.0661173
[389]	valid_0's l1: 0.0661167
[390]	valid_0's l1: 0.0661169
[391]	valid_0's l1: 0.0661161
[392]	valid_0's l1: 0.0661154
[393]	valid_0's l1: 0.0661158
[394]	valid_0's l1: 0.0661158
[395]	valid_0's l1: 0.0661154
[396]	valid_0's l1: 0.0661153
[397]	valid_0's l1: 0.0661151
[398]	valid_0's l1: 0.0661147
[399]	valid_0's l1: 0.0661142
[400]	valid_0's l1: 0.0661143
[401]	valid_0's l1: 0.0661144
[402]	valid_0's l1: 0.0661147
[403]	valid_0's l1: 0.0661148
[404]	valid_0's l1: 0.0661151
[405]	valid_0's l1: 0.0661147
[406]	valid_0's l1: 0.0661151
[407]	valid_0's l1: 0.0661151
[408]	valid_0's l1: 0.0661145
[409]	valid_0's l1: 0.0661143
[410]	valid_0's l1: 0.0661143
[411]	valid_0's l1: 0.0661143
[412]	valid_0's l1: 0.0661144
[413]	valid_0's l1: 0.0661142
[414]	valid_0's l1: 0.0661143
[415]	valid_0's l1: 0.0661146
[416]	valid_0's l1: 0.066115
[417]	valid_0's l1: 0.066115
[418]	valid_0's l1: 0.0661149
[419]	valid_0's l1: 0.0661149
[420]	valid_0's l1: 0.0661152
[421]	valid_0's l1: 0.0661156
[422]	valid_0's l1: 0.0661156
[423]	valid_0's l1: 0.0661159
[424]	valid_0's l1: 0.0661164
[425]	valid_0's l1: 0.0661164
[426]	valid_0's l1: 0.0661164
[427]	valid_0's l1: 0.0661162
[428]	valid_0's l1: 0.0661168
[429]	valid_0's l1: 0.0661167
[430]	valid_0's l1: 0.0661168
[431]	valid_0's l1: 0.0661174
[432]	valid_0's l1: 0.0661179
[433]	valid_0's l1: 0.0661176
[434]	valid_0's l1: 0.0661177
[435]	valid_0's l1: 0.0661184
[436]	valid_0's l1: 0.0661183
[437]	valid_0's l1: 0.0661192
[438]	valid_0's l1: 0.0661197
[439]	valid_0's l1: 0.06612
[440]	valid_0's l1: 0.0661197
[441]	valid_0's l1: 0.0661199
[442]	valid_0's l1: 0.0661202
[443]	valid_0's l1: 0.0661201
[444]	valid_0's l1: 0.0661201
[445]	valid_0's l1: 0.0661202
[446]	valid_0's l1: 0.0661199
[447]	valid_0's l1: 0.06612
[448]	valid_0's l1: 0.06612
[449]	valid_0's l1: 0.0661203
[450]	valid_0's l1: 0.0661206
[451]	valid_0's l1: 0.0661209
[452]	valid_0's l1: 0.066121
[453]	valid_0's l1: 0.0661213
[454]	valid_0's l1: 0.0661217
[455]	valid_0's l1: 0.0661217
[456]	valid_0's l1: 0.0661223
[457]	valid_0's l1: 0.0661225
[458]	valid_0's l1: 0.0661225
[459]	valid_0's l1: 0.0661226
[460]	valid_0's l1: 0.0661226
[461]	valid_0's l1: 0.0661226
[462]	valid_0's l1: 0.0661226
[463]	valid_0's l1: 0.0661222
[464]	valid_0's l1: 0.0661225
[465]	valid_0's l1: 0.0661227
[466]	valid_0's l1: 0.066123
[467]	valid_0's l1: 0.0661234
[468]	valid_0's l1: 0.066123
[469]	valid_0's l1: 0.066123
[470]	valid_0's l1: 0.0661233
[471]	valid_0's l1: 0.0661229
[472]	valid_0's l1: 0.0661233
[473]	valid_0's l1: 0.0661233
[474]	valid_0's l1: 0.0661235
[475]	valid_0's l1: 0.0661234
[476]	valid_0's l1: 0.0661237
[477]	valid_0's l1: 0.0661237
[478]	valid_0's l1: 0.0661237
[479]	valid_0's l1: 0.0661238
[480]	valid_0's l1: 0.0661238
[481]	valid_0's l1: 0.0661245
[482]	valid_0's l1: 0.0661246
[483]	valid_0's l1: 0.0661248
[484]	valid_0's l1: 0.0661252
[485]	valid_0's l1: 0.0661255
[486]	valid_0's l1: 0.0661254
[487]	valid_0's l1: 0.0661261
[488]	valid_0's l1: 0.0661261
[489]	valid_0's l1: 0.0661264
[490]	valid_0's l1: 0.0661267
[491]	valid_0's l1: 0.0661267
[492]	valid_0's l1: 0.0661268
[493]	valid_0's l1: 0.0661277
[494]	valid_0's l1: 0.0661284
[495]	valid_0's l1: 0.0661291
[496]	valid_0's l1: 0.0661289
[497]	valid_0's l1: 0.0661291
[498]	valid_0's l1: 0.0661293
[499]	valid_0's l1: 0.0661297
[500]	valid_0's l1: 0.0661295
Prepare for the prediction ...
Predicting on validation set ...
Validation set MAE: 0.0661294616677
Start prediction ...
Start write result ...
In [50]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import gc

print('Loading data ...')
train = pd.read_csv('train_2016_v2.csv')
prop  = pd.read_csv('properties_2016.csv')

for c, dtype in zip(prop.columns, prop.dtypes):
    if dtype == np.float64:
        prop[c] = prop[c].astype(np.float32)

df_train = train.merge(prop, how='left', on='parcelid')

x_train = df_train.drop(['parcelid',
                         'logerror',
                         'transactiondate',
                         'propertyzoningdesc',
                         'propertycountylandusecode'], axis=1)

x_train = x_train[['calculatedfinishedsquarefeet','structuretaxvaluedollarcnt','latitude','longitude','taxvaluedollarcnt',\
                   'yearbuilt','taxamount','lotsizesquarefeet','landtaxvaluedollarcnt','bathroomcnt']]
y_train = df_train['logerror'].values
print(x_train.shape, y_train.shape)

train_columns = x_train.columns

for c in x_train.dtypes[x_train.dtypes == object].index.values:
    x_train[c] = (x_train[c] == True)

del df_train; gc.collect()

split = 80000
x_train, y_train, x_valid, y_valid = x_train[:split], y_train[:split], x_train[split:], y_train[split:]
x_train = x_train.values.astype(np.float32, copy=False)
x_valid = x_valid.values.astype(np.float32, copy=False)

d_train = lgb.Dataset(x_train, label=y_train)
d_valid = lgb.Dataset(x_valid, label=y_valid)

params = {
    'max_bin'         : 20,
    'learning_rate'   : 0.0021,        # shrinkage_rate
    'boosting_type'   : 'gbdt',
    'objective'       : 'regression',
    'metric'          : 'l1',          # or 'mae'
    'sub_feature'     : 0.5,           # feature_fraction
    'bagging_fraction': 0.85,          # sub_row
    'bagging_freq'    : 40,
    'num_leaves'      : 512,           # num_leaf
    'min_data'        : 500,           # min_data_in_leaf
    'min_hessian'     : 0.05,          # min_sum_hessian_in_leaf
          }


watchlist = [d_valid]
clf       = lgb.train(params, d_train, 500, watchlist)

del d_train, d_valid; gc.collect()
del x_train; gc.collect()  #, x_valid

print("Prepare for the prediction ...")
sample = pd.read_csv('sample_submission.csv')
sample['parcelid'] = sample['ParcelId']

df_test = sample.merge(prop, on='parcelid', how='left')
del sample, prop; gc.collect()

x_test = df_test[train_columns]
del df_test; gc.collect()

for c in x_test.dtypes[x_test.dtypes == object].index.values:
    x_test[c] = (x_test[c] == True)
x_test = x_test.values.astype(np.float32, copy=False)

print('Predicting on validation set ...')
pred = clf.predict(x_valid)
print('Validation set MAE: '+ str(mean_absolute_error(y_valid, pred)))

print("Start prediction ...")
# num_threads > 1 will predict very slow in kernel
clf.reset_parameter({"num_threads":1})
p_test = clf.predict(x_test)
p_test = 0.97*p_test + 0.03*0.011

del x_test; gc.collect()

print("Start write result ...")
sub = pd.read_csv('sample_submission.csv')
for c in sub.columns[sub.columns != 'ParcelId']:
    sub[c] = p_test

sub.to_csv('lgb_starter_f.csv', index=False, float_format='%.4f')
Loading data ...
((90275, 10), (90275L,))
[1]	valid_0's l1: 0.0665483
[2]	valid_0's l1: 0.0665464
[3]	valid_0's l1: 0.0665439
[4]	valid_0's l1: 0.0665418
[5]	valid_0's l1: 0.0665406
[6]	valid_0's l1: 0.0665381
[7]	valid_0's l1: 0.0665359
[8]	valid_0's l1: 0.0665339
[9]	valid_0's l1: 0.0665319
[10]	valid_0's l1: 0.0665298
[11]	valid_0's l1: 0.066528
[12]	valid_0's l1: 0.0665259
[13]	valid_0's l1: 0.0665238
[14]	valid_0's l1: 0.0665219
[15]	valid_0's l1: 0.0665195
[16]	valid_0's l1: 0.0665168
[17]	valid_0's l1: 0.0665153
[18]	valid_0's l1: 0.0665136
[19]	valid_0's l1: 0.0665104
[20]	valid_0's l1: 0.0665084
[21]	valid_0's l1: 0.0665057
[22]	valid_0's l1: 0.0665035
[23]	valid_0's l1: 0.0665017
[24]	valid_0's l1: 0.0664999
[25]	valid_0's l1: 0.0664979
[26]	valid_0's l1: 0.0664955
[27]	valid_0's l1: 0.0664938
[28]	valid_0's l1: 0.0664918
[29]	valid_0's l1: 0.0664901
[30]	valid_0's l1: 0.066488
[31]	valid_0's l1: 0.0664858
[32]	valid_0's l1: 0.0664843
[33]	valid_0's l1: 0.0664826
[34]	valid_0's l1: 0.0664806
[35]	valid_0's l1: 0.0664792
[36]	valid_0's l1: 0.0664775
[37]	valid_0's l1: 0.0664752
[38]	valid_0's l1: 0.0664731
[39]	valid_0's l1: 0.0664709
[40]	valid_0's l1: 0.0664688
[41]	valid_0's l1: 0.0664677
[42]	valid_0's l1: 0.0664659
[43]	valid_0's l1: 0.0664634
[44]	valid_0's l1: 0.0664613
[45]	valid_0's l1: 0.0664606
[46]	valid_0's l1: 0.0664579
[47]	valid_0's l1: 0.0664561
[48]	valid_0's l1: 0.0664535
[49]	valid_0's l1: 0.0664518
[50]	valid_0's l1: 0.06645
[51]	valid_0's l1: 0.0664479
[52]	valid_0's l1: 0.0664461
[53]	valid_0's l1: 0.0664449
[54]	valid_0's l1: 0.0664435
[55]	valid_0's l1: 0.0664415
[56]	valid_0's l1: 0.0664391
[57]	valid_0's l1: 0.0664376
[58]	valid_0's l1: 0.0664364
[59]	valid_0's l1: 0.0664354
[60]	valid_0's l1: 0.0664333
[61]	valid_0's l1: 0.066431
[62]	valid_0's l1: 0.066429
[63]	valid_0's l1: 0.0664265
[64]	valid_0's l1: 0.0664251
[65]	valid_0's l1: 0.066424
[66]	valid_0's l1: 0.0664225
[67]	valid_0's l1: 0.066421
[68]	valid_0's l1: 0.0664198
[69]	valid_0's l1: 0.0664184
[70]	valid_0's l1: 0.066417
[71]	valid_0's l1: 0.0664161
[72]	valid_0's l1: 0.0664142
[73]	valid_0's l1: 0.0664133
[74]	valid_0's l1: 0.0664116
[75]	valid_0's l1: 0.0664097
[76]	valid_0's l1: 0.0664087
[77]	valid_0's l1: 0.0664067
[78]	valid_0's l1: 0.0664051
[79]	valid_0's l1: 0.0664029
[80]	valid_0's l1: 0.0664016
[81]	valid_0's l1: 0.0664005
[82]	valid_0's l1: 0.0663989
[83]	valid_0's l1: 0.066397
[84]	valid_0's l1: 0.0663948
[85]	valid_0's l1: 0.0663929
[86]	valid_0's l1: 0.0663916
[87]	valid_0's l1: 0.0663892
[88]	valid_0's l1: 0.0663876
[89]	valid_0's l1: 0.0663863
[90]	valid_0's l1: 0.066385
[91]	valid_0's l1: 0.0663838
[92]	valid_0's l1: 0.0663825
[93]	valid_0's l1: 0.0663805
[94]	valid_0's l1: 0.0663793
[95]	valid_0's l1: 0.0663784
[96]	valid_0's l1: 0.066377
[97]	valid_0's l1: 0.066376
[98]	valid_0's l1: 0.066374
[99]	valid_0's l1: 0.0663734
[100]	valid_0's l1: 0.0663713
[101]	valid_0's l1: 0.0663696
[102]	valid_0's l1: 0.0663676
[103]	valid_0's l1: 0.066366
[104]	valid_0's l1: 0.0663641
[105]	valid_0's l1: 0.0663628
[106]	valid_0's l1: 0.0663611
[107]	valid_0's l1: 0.0663591
[108]	valid_0's l1: 0.0663583
[109]	valid_0's l1: 0.0663573
[110]	valid_0's l1: 0.066356
[111]	valid_0's l1: 0.0663549
[112]	valid_0's l1: 0.0663529
[113]	valid_0's l1: 0.066352
[114]	valid_0's l1: 0.0663508
[115]	valid_0's l1: 0.0663492
[116]	valid_0's l1: 0.0663485
[117]	valid_0's l1: 0.0663467
[118]	valid_0's l1: 0.0663457
[119]	valid_0's l1: 0.0663443
[120]	valid_0's l1: 0.066343
[121]	valid_0's l1: 0.0663413
[122]	valid_0's l1: 0.0663405
[123]	valid_0's l1: 0.0663395
[124]	valid_0's l1: 0.0663381
[125]	valid_0's l1: 0.0663368
[126]	valid_0's l1: 0.0663352
[127]	valid_0's l1: 0.0663343
[128]	valid_0's l1: 0.0663338
[129]	valid_0's l1: 0.0663328
[130]	valid_0's l1: 0.0663317
[131]	valid_0's l1: 0.0663305
[132]	valid_0's l1: 0.0663287
[133]	valid_0's l1: 0.0663279
[134]	valid_0's l1: 0.0663267
[135]	valid_0's l1: 0.0663257
[136]	valid_0's l1: 0.0663247
[137]	valid_0's l1: 0.0663233
[138]	valid_0's l1: 0.0663226
[139]	valid_0's l1: 0.0663219
[140]	valid_0's l1: 0.0663212
[141]	valid_0's l1: 0.0663203
[142]	valid_0's l1: 0.0663185
[143]	valid_0's l1: 0.0663177
[144]	valid_0's l1: 0.0663171
[145]	valid_0's l1: 0.0663164
[146]	valid_0's l1: 0.0663155
[147]	valid_0's l1: 0.0663143
[148]	valid_0's l1: 0.0663131
[149]	valid_0's l1: 0.0663116
[150]	valid_0's l1: 0.0663108
[151]	valid_0's l1: 0.0663097
[152]	valid_0's l1: 0.0663093
[153]	valid_0's l1: 0.0663079
[154]	valid_0's l1: 0.0663072
[155]	valid_0's l1: 0.0663063
[156]	valid_0's l1: 0.0663054
[157]	valid_0's l1: 0.0663039
[158]	valid_0's l1: 0.0663025
[159]	valid_0's l1: 0.0663022
[160]	valid_0's l1: 0.0663012
[161]	valid_0's l1: 0.0662998
[162]	valid_0's l1: 0.0662997
[163]	valid_0's l1: 0.0662983
[164]	valid_0's l1: 0.066297
[165]	valid_0's l1: 0.0662959
[166]	valid_0's l1: 0.0662945
[167]	valid_0's l1: 0.0662936
[168]	valid_0's l1: 0.0662927
[169]	valid_0's l1: 0.0662915
[170]	valid_0's l1: 0.0662907
[171]	valid_0's l1: 0.0662898
[172]	valid_0's l1: 0.066289
[173]	valid_0's l1: 0.0662879
[174]	valid_0's l1: 0.0662868
[175]	valid_0's l1: 0.0662862
[176]	valid_0's l1: 0.0662852
[177]	valid_0's l1: 0.0662839
[178]	valid_0's l1: 0.0662827
[179]	valid_0's l1: 0.0662818
[180]	valid_0's l1: 0.0662808
[181]	valid_0's l1: 0.0662806
[182]	valid_0's l1: 0.0662793
[183]	valid_0's l1: 0.0662778
[184]	valid_0's l1: 0.0662768
[185]	valid_0's l1: 0.0662766
[186]	valid_0's l1: 0.0662758
[187]	valid_0's l1: 0.0662747
[188]	valid_0's l1: 0.0662744
[189]	valid_0's l1: 0.0662738
[190]	valid_0's l1: 0.0662729
[191]	valid_0's l1: 0.0662721
[192]	valid_0's l1: 0.0662715
[193]	valid_0's l1: 0.0662704
[194]	valid_0's l1: 0.0662692
[195]	valid_0's l1: 0.0662685
[196]	valid_0's l1: 0.0662682
[197]	valid_0's l1: 0.0662681
[198]	valid_0's l1: 0.0662672
[199]	valid_0's l1: 0.0662663
[200]	valid_0's l1: 0.0662652
[201]	valid_0's l1: 0.066264
[202]	valid_0's l1: 0.066263
[203]	valid_0's l1: 0.0662626
[204]	valid_0's l1: 0.0662622
[205]	valid_0's l1: 0.0662611
[206]	valid_0's l1: 0.0662602
[207]	valid_0's l1: 0.066259
[208]	valid_0's l1: 0.0662582
[209]	valid_0's l1: 0.066258
[210]	valid_0's l1: 0.0662573
[211]	valid_0's l1: 0.0662564
[212]	valid_0's l1: 0.0662555
[213]	valid_0's l1: 0.0662549
[214]	valid_0's l1: 0.0662539
[215]	valid_0's l1: 0.0662533
[216]	valid_0's l1: 0.0662527
[217]	valid_0's l1: 0.0662522
[218]	valid_0's l1: 0.0662513
[219]	valid_0's l1: 0.0662504
[220]	valid_0's l1: 0.0662499
[221]	valid_0's l1: 0.0662486
[222]	valid_0's l1: 0.0662479
[223]	valid_0's l1: 0.066247
[224]	valid_0's l1: 0.0662465
[225]	valid_0's l1: 0.0662462
[226]	valid_0's l1: 0.0662455
[227]	valid_0's l1: 0.0662446
[228]	valid_0's l1: 0.066244
[229]	valid_0's l1: 0.0662433
[230]	valid_0's l1: 0.0662428
[231]	valid_0's l1: 0.0662419
[232]	valid_0's l1: 0.0662415
[233]	valid_0's l1: 0.0662402
[234]	valid_0's l1: 0.0662392
[235]	valid_0's l1: 0.0662383
[236]	valid_0's l1: 0.0662375
[237]	valid_0's l1: 0.0662369
[238]	valid_0's l1: 0.0662364
[239]	valid_0's l1: 0.0662357
[240]	valid_0's l1: 0.066235
[241]	valid_0's l1: 0.0662345
[242]	valid_0's l1: 0.0662339
[243]	valid_0's l1: 0.0662331
[244]	valid_0's l1: 0.0662329
[245]	valid_0's l1: 0.0662323
[246]	valid_0's l1: 0.0662318
[247]	valid_0's l1: 0.0662314
[248]	valid_0's l1: 0.0662307
[249]	valid_0's l1: 0.0662299
[250]	valid_0's l1: 0.0662289
[251]	valid_0's l1: 0.0662277
[252]	valid_0's l1: 0.0662272
[253]	valid_0's l1: 0.0662274
[254]	valid_0's l1: 0.0662267
[255]	valid_0's l1: 0.0662266
[256]	valid_0's l1: 0.066226
[257]	valid_0's l1: 0.0662255
[258]	valid_0's l1: 0.0662246
[259]	valid_0's l1: 0.0662242
[260]	valid_0's l1: 0.066224
[261]	valid_0's l1: 0.0662236
[262]	valid_0's l1: 0.0662228
[263]	valid_0's l1: 0.0662219
[264]	valid_0's l1: 0.0662216
[265]	valid_0's l1: 0.0662208
[266]	valid_0's l1: 0.0662205
[267]	valid_0's l1: 0.0662202
[268]	valid_0's l1: 0.0662194
[269]	valid_0's l1: 0.0662191
[270]	valid_0's l1: 0.0662187
[271]	valid_0's l1: 0.0662191
[272]	valid_0's l1: 0.0662188
[273]	valid_0's l1: 0.0662178
[274]	valid_0's l1: 0.0662172
[275]	valid_0's l1: 0.066217
[276]	valid_0's l1: 0.0662167
[277]	valid_0's l1: 0.0662164
[278]	valid_0's l1: 0.0662159
[279]	valid_0's l1: 0.0662149
[280]	valid_0's l1: 0.0662146
[281]	valid_0's l1: 0.0662142
[282]	valid_0's l1: 0.066214
[283]	valid_0's l1: 0.0662136
[284]	valid_0's l1: 0.0662129
[285]	valid_0's l1: 0.0662124
[286]	valid_0's l1: 0.0662121
[287]	valid_0's l1: 0.0662116
[288]	valid_0's l1: 0.066211
[289]	valid_0's l1: 0.0662107
[290]	valid_0's l1: 0.0662105
[291]	valid_0's l1: 0.0662097
[292]	valid_0's l1: 0.0662094
[293]	valid_0's l1: 0.0662085
[294]	valid_0's l1: 0.0662087
[295]	valid_0's l1: 0.0662084
[296]	valid_0's l1: 0.0662084
[297]	valid_0's l1: 0.0662084
[298]	valid_0's l1: 0.0662083
[299]	valid_0's l1: 0.0662075
[300]	valid_0's l1: 0.0662078
[301]	valid_0's l1: 0.0662075
[302]	valid_0's l1: 0.0662069
[303]	valid_0's l1: 0.0662061
[304]	valid_0's l1: 0.0662062
[305]	valid_0's l1: 0.0662063
[306]	valid_0's l1: 0.0662058
[307]	valid_0's l1: 0.0662061
[308]	valid_0's l1: 0.0662056
[309]	valid_0's l1: 0.0662056
[310]	valid_0's l1: 0.066206
[311]	valid_0's l1: 0.0662064
[312]	valid_0's l1: 0.0662064
[313]	valid_0's l1: 0.066206
[314]	valid_0's l1: 0.0662057
[315]	valid_0's l1: 0.0662056
[316]	valid_0's l1: 0.0662052
[317]	valid_0's l1: 0.0662052
[318]	valid_0's l1: 0.0662055
[319]	valid_0's l1: 0.0662055
[320]	valid_0's l1: 0.0662059
[321]	valid_0's l1: 0.0662058
[322]	valid_0's l1: 0.0662057
[323]	valid_0's l1: 0.0662058
[324]	valid_0's l1: 0.0662056
[325]	valid_0's l1: 0.0662056
[326]	valid_0's l1: 0.066205
[327]	valid_0's l1: 0.0662046
[328]	valid_0's l1: 0.0662044
[329]	valid_0's l1: 0.0662047
[330]	valid_0's l1: 0.0662045
[331]	valid_0's l1: 0.0662042
[332]	valid_0's l1: 0.066204
[333]	valid_0's l1: 0.0662034
[334]	valid_0's l1: 0.0662032
[335]	valid_0's l1: 0.0662028
[336]	valid_0's l1: 0.0662022
[337]	valid_0's l1: 0.0662022
[338]	valid_0's l1: 0.0662024
[339]	valid_0's l1: 0.0662021
[340]	valid_0's l1: 0.0662015
[341]	valid_0's l1: 0.0662019
[342]	valid_0's l1: 0.0662009
[343]	valid_0's l1: 0.0662009
[344]	valid_0's l1: 0.0662007
[345]	valid_0's l1: 0.0662005
[346]	valid_0's l1: 0.0662003
[347]	valid_0's l1: 0.0662008
[348]	valid_0's l1: 0.0662009
[349]	valid_0's l1: 0.0662007
[350]	valid_0's l1: 0.0662004
[351]	valid_0's l1: 0.0662006
[352]	valid_0's l1: 0.0662003
[353]	valid_0's l1: 0.0662006
[354]	valid_0's l1: 0.0662009
[355]	valid_0's l1: 0.0662011
[356]	valid_0's l1: 0.0662013
[357]	valid_0's l1: 0.0662016
[358]	valid_0's l1: 0.0662015
[359]	valid_0's l1: 0.0662013
[360]	valid_0's l1: 0.0662013
[361]	valid_0's l1: 0.0662005
[362]	valid_0's l1: 0.0662004
[363]	valid_0's l1: 0.0661999
[364]	valid_0's l1: 0.0662
[365]	valid_0's l1: 0.0661995
[366]	valid_0's l1: 0.0662
[367]	valid_0's l1: 0.0661998
[368]	valid_0's l1: 0.0661998
[369]	valid_0's l1: 0.066199
[370]	valid_0's l1: 0.0661987
[371]	valid_0's l1: 0.0661988
[372]	valid_0's l1: 0.0661985
[373]	valid_0's l1: 0.0661979
[374]	valid_0's l1: 0.0661978
[375]	valid_0's l1: 0.0661982
[376]	valid_0's l1: 0.0661981
[377]	valid_0's l1: 0.066198
[378]	valid_0's l1: 0.066198
[379]	valid_0's l1: 0.0661979
[380]	valid_0's l1: 0.0661984
[381]	valid_0's l1: 0.0661983
[382]	valid_0's l1: 0.0661986
[383]	valid_0's l1: 0.0661988
[384]	valid_0's l1: 0.0661987
[385]	valid_0's l1: 0.0661984
[386]	valid_0's l1: 0.0661982
[387]	valid_0's l1: 0.0661987
[388]	valid_0's l1: 0.0661983
[389]	valid_0's l1: 0.0661985
[390]	valid_0's l1: 0.0661987
[391]	valid_0's l1: 0.0661988
[392]	valid_0's l1: 0.0661987
[393]	valid_0's l1: 0.0661985
[394]	valid_0's l1: 0.0661992
[395]	valid_0's l1: 0.0661995
[396]	valid_0's l1: 0.0661996
[397]	valid_0's l1: 0.0662001
[398]	valid_0's l1: 0.0661999
[399]	valid_0's l1: 0.0661999
[400]	valid_0's l1: 0.0662001
[401]	valid_0's l1: 0.0662006
[402]	valid_0's l1: 0.066201
[403]	valid_0's l1: 0.0662011
[404]	valid_0's l1: 0.0662012
[405]	valid_0's l1: 0.066201
[406]	valid_0's l1: 0.066201
[407]	valid_0's l1: 0.066201
[408]	valid_0's l1: 0.0662015
[409]	valid_0's l1: 0.0662013
[410]	valid_0's l1: 0.0662009
[411]	valid_0's l1: 0.0662008
[412]	valid_0's l1: 0.0662009
[413]	valid_0's l1: 0.0662014
[414]	valid_0's l1: 0.0662011
[415]	valid_0's l1: 0.0662011
[416]	valid_0's l1: 0.0662015
[417]	valid_0's l1: 0.0662015
[418]	valid_0's l1: 0.0662015
[419]	valid_0's l1: 0.0662018
[420]	valid_0's l1: 0.0662021
[421]	valid_0's l1: 0.0662024
[422]	valid_0's l1: 0.0662021
[423]	valid_0's l1: 0.0662025
[424]	valid_0's l1: 0.0662027
[425]	valid_0's l1: 0.0662023
[426]	valid_0's l1: 0.0662026
[427]	valid_0's l1: 0.0662022
[428]	valid_0's l1: 0.0662021
[429]	valid_0's l1: 0.0662024
[430]	valid_0's l1: 0.066202
[431]	valid_0's l1: 0.0662023
[432]	valid_0's l1: 0.0662023
[433]	valid_0's l1: 0.0662021
[434]	valid_0's l1: 0.066202
[435]	valid_0's l1: 0.0662019
[436]	valid_0's l1: 0.066202
[437]	valid_0's l1: 0.0662018
[438]	valid_0's l1: 0.0662018
[439]	valid_0's l1: 0.0662014
[440]	valid_0's l1: 0.0662018
[441]	valid_0's l1: 0.066202
[442]	valid_0's l1: 0.0662016
[443]	valid_0's l1: 0.0662015
[444]	valid_0's l1: 0.0662018
[445]	valid_0's l1: 0.066202
[446]	valid_0's l1: 0.0662019
[447]	valid_0's l1: 0.0662021
[448]	valid_0's l1: 0.0662028
[449]	valid_0's l1: 0.0662021
[450]	valid_0's l1: 0.0662023
[451]	valid_0's l1: 0.0662021
[452]	valid_0's l1: 0.0662017
[453]	valid_0's l1: 0.066202
[454]	valid_0's l1: 0.0662024
[455]	valid_0's l1: 0.066202
[456]	valid_0's l1: 0.0662027
[457]	valid_0's l1: 0.0662025
[458]	valid_0's l1: 0.0662026
[459]	valid_0's l1: 0.0662023
[460]	valid_0's l1: 0.0662023
[461]	valid_0's l1: 0.0662028
[462]	valid_0's l1: 0.0662028
[463]	valid_0's l1: 0.0662032
[464]	valid_0's l1: 0.0662036
[465]	valid_0's l1: 0.0662038
[466]	valid_0's l1: 0.0662034
[467]	valid_0's l1: 0.0662038
[468]	valid_0's l1: 0.0662042
[469]	valid_0's l1: 0.0662044
[470]	valid_0's l1: 0.0662048
[471]	valid_0's l1: 0.0662052
[472]	valid_0's l1: 0.0662049
[473]	valid_0's l1: 0.0662053
[474]	valid_0's l1: 0.0662055
[475]	valid_0's l1: 0.0662055
[476]	valid_0's l1: 0.0662056
[477]	valid_0's l1: 0.0662058
[478]	valid_0's l1: 0.0662057
[479]	valid_0's l1: 0.0662058
[480]	valid_0's l1: 0.0662057
[481]	valid_0's l1: 0.0662056
[482]	valid_0's l1: 0.0662058
[483]	valid_0's l1: 0.0662057
[484]	valid_0's l1: 0.0662059
[485]	valid_0's l1: 0.0662054
[486]	valid_0's l1: 0.0662051
[487]	valid_0's l1: 0.0662044
[488]	valid_0's l1: 0.0662043
[489]	valid_0's l1: 0.0662039
[490]	valid_0's l1: 0.0662039
[491]	valid_0's l1: 0.0662037
[492]	valid_0's l1: 0.0662041
[493]	valid_0's l1: 0.0662039
[494]	valid_0's l1: 0.066204
[495]	valid_0's l1: 0.0662038
[496]	valid_0's l1: 0.0662036
[497]	valid_0's l1: 0.0662037
[498]	valid_0's l1: 0.0662041
[499]	valid_0's l1: 0.066204
[500]	valid_0's l1: 0.0662045
Prepare for the prediction ...
Predicting on validation set ...
Validation set MAE: 0.0662044535913
Start prediction ...
Start write result ...
In [ ]: